//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
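//
// For example (an illustrative sketch, not the exact IR this pass emits),
// with a vectorization factor of 4 a loop such as
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// is conceptually rewritten so that each wide iteration loads, adds, and
// stores four elements at once (<4 x i32> operations) and increments i by 4.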
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
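// An illustrative example of an interleaved access pattern (not tied to any
// particular target): the two loads below form an interleave group with
// factor 2, and can be vectorized as one wide load followed by shuffles that
// separate the even and odd elements:
//   for (i = 0; i < n; ++i) {
//     sum += a[2 * i];     // even elements
//     sum += a[2 * i + 1]; // odd elements
//   }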
/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}
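// For example, ToVectorTy(i32, 4) yields the LLVM type <4 x i32>, while
// ToVectorTy(i32, 1) and ToVectorTy(void, 4) return the input type unchanged.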
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
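// As an illustrative example (exact sizes depend on the DataLayout): i24
// typically has an allocation size of 4 bytes, but <4 x i24> has a store size
// of 12 bytes, so 4 * 4 != 12 and i24 is irregular at VF = 4; by contrast,
// i32 (4-byte alloc, 16-byte <4 x i32> store size) is regular.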
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
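// For example, when a predicated block's cost is divided by this value, a
// block costing 8 units is accounted as 8 / 2 = 4 units per header iteration,
// reflecting the assumed 50% execution probability.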
/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

/// A helper function that applies the given fast-math flags \p FMF to
/// floating-point operations.
static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}
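// For example, getSignedIntOrFpConstant(i32, -1) produces the integer
// constant i32 -1, while getSignedIntOrFpConstant(float, -1) produces the
// floating-point constant float -1.0.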
namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
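  ///
  /// An illustrative sketch of the scalarized-def / vector-use case described
  /// above (lane names are hypothetical): if a definition %d is scalarized
  /// into lanes %d0..%d3 and a vector use is then encountered at VF = 4, the
  /// vector value is assembled on-demand as
  ///   %v0 = insertelement <4 x i32> undef, i32 %d0, i32 0
  ///   %v1 = insertelement <4 x i32> %v0, i32 %d1, i32 1
  ///   ... and so on for the remaining lanes.
  ///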
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);
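  // An illustrative sketch: for an integer IV with StartIdx = 0 and a step
  // value %s, getStepVector(splat(%start), 0, %s) conceptually yields
  //   <%start + 0*%s, %start + 1*%s, %start + 2*%s, %start + 3*%s>
  // at VF = 4, i.e. one lane per consecutive scalar iteration.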
  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;
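  // A worked example of the transformation above: for an integer induction
  // with StartValue 16 and StepValue 4, Index i maps to 16 + 4 * i; for a
  // pointer induction with the same step, Index i maps to the address
  // &StartValue[4 * i].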
  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm
/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}
namespace llvm {

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
                             LoopInfo *LI, LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(bool OptForSize);

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
                                 unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }
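  // An illustrative example of such an optimizable truncate: given a 64-bit
  // induction %iv and a use "%t = trunc i64 %iv to i32", the truncate can be
  // removed by introducing a parallel i32 induction variable that produces
  // %t's value directly.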
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize.
  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available, or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Create an analysis remark that explains why vectorization failed
  ///
  /// \p RemarkName is the identifier for the remark. \return the remark object
  /// that can be streamed to.
  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
    return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
                                  RemarkName, TheLoop);
  }

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  bool IsScalarEpilogueAllowed = true;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              unsigned VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(unsigned VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(unsigned VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
                                std::pair<InstWidening, unsigned>>;

  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, unsigned VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  }

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   unsigned VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

} // end namespace llvm
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->empty() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
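
// For illustration only: assuming the pragma is lowered to llvm.loop.vectorize
// metadata as described above, an explicitly annotated outer loop that this
// function would accept looks like:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)      // outer loop: explicit VF of 4
//     for (int j = 0; j < M; ++j)
//       A[i][j] = B[i][j] + C[i];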

static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If
  // we are stress testing the VPlan H-CFG construction, we collect the
  // outermost loop of every loop nest.
  if (L.empty() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID) {
    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI);
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, the broadcast will be inside
  // the vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}
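
// For example, with VF = 4 the splat created above expands to IR along these
// lines (names illustrative):
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                          <4 x i32> undef, <4 x i32> zeroinitializer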

void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader.
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
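
// For example, an i32 IV with start 0 and step 1, at VF = 4 and UF = 2,
// results in IR roughly like (illustrative):
//   %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                            [ %vec.ind.next, %vector.body ]
//   %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>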

bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest. The
  // rest of the Casts (if they exist) have no uses outside the induction
  // update chain itself.
  Instruction *CastInst = *Casts.begin();
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}

void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
         "Induction step should be loop invariant");
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
  Value *Step = nullptr;
  if (PSE.getSE()->isSCEVable(IV->getType())) {
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                             LoopVectorPreHeader->getTerminator());
  } else {
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  // TODO: Don't do it unless the vectorized IV is really required.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}

Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  int VLen = Val->getType()->getVectorNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check: MulOp may be a constant.
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}
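
// For example, for an integer IV with a constant step of 1 and VF = 4, a call
// such as getStepVector(%splat, /*StartIdx=*/4, %step) produces roughly:
//   %induction = add <4 x i32> %splat, <i32 4, i32 5, i32 6, i32 7>
// (the multiply by the all-ones step vector constant-folds away).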

void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF > 1 && "VF should be greater than one");

  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
                                                                         : VF;
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}
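
// For example, with VF = 4, UF = 2 and an integer step of 1, the nest above
// records scalar values equivalent to:
//   Part 0: %iv + 0, %iv + 1, %iv + 2, %iv + 3
//   Part 1: %iv + 4, %iv + 5, %iv + 6, %iv + 7
// (only lane 0 of each part when EntryVal is uniform after vectorization).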

Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over
    // to the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the
    // value is known to be uniform after vectorization, this corresponds to
    // lane zero of the Part unroll iteration. Otherwise, the last instruction
    // is the one we created for the last vector lane of the Part unroll
    // iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This
    // ensures the insertelement sequence will directly follow the scalar
    // definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using
    // insertelement instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}
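
// For example, packing the scalars %s0..%s3 of one unroll part at VF = 4
// generates an insertelement chain roughly like:
//   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
//   %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
//   %v2 = insertelement <4 x i32> %v1, i32 %s2, i32 2
//   %v3 = insertelement <4 x i32> %v2, i32 %s3, i32 3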

Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
  // If the value is not an instruction contained in the loop, it should
  // already be scalar.
  if (OrigLoop->isLoopInvariant(V))
    return V;

  assert(Instance.Lane > 0
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
             : true && "Uniform values only have lane zero");

  // If the value from the original loop has not been vectorized, it is
  // represented by UF x VF scalar values in the new loop. Return the requested
  // scalar value.
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
    return VectorLoopValueMap.getScalarValue(V, Instance);

  // If the value has not been scalarized, get its entry in VectorLoopValueMap
  // for the given unroll part. If this entry is not a vector type (i.e., the
  // vectorization factor is one), there is no need to generate an
  // extractelement instruction.
  auto *U = getOrCreateVectorValue(V, Instance.Part);
  if (!U->getType()->isVectorTy()) {
    assert(VF == 1 && "Value not scalarized has non-vector type");
    return U;
  }

  // Otherwise, the value from the original loop has been vectorized and is
  // represented by UF vector values. Extract and return the requested scalar
  // value from the appropriate vector lane.
  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
}

void InnerLoopVectorizer::packScalarIntoVectorValue(
    Value *V, const VPIteration &Instance) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                            Builder.getInt32(Instance.Lane));
  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}

Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant *, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}
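
// For example, with VF = 4 the mask built above is <3, 2, 1, 0>:
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>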

// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();
  Value *Ptr = getLoadStorePointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  VectorParts Mask;
  bool IsMaskForCondRequired = BlockInMask;
  if (IsMaskForCondRequired) {
    Mask = *BlockInMask;
    // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
  }

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
    if (InBounds)
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }
2129 if (isa<LoadInst>(Instr)) {
2130 // For each unroll part, create a wide load for the group.
2131 SmallVector<Value *, 2> NewLoads;
2132 for (unsigned Part = 0; Part < UF; Part++) {
2133 Instruction *NewLoad;
2134 if (IsMaskForCondRequired || MaskForGaps) {
2135 assert(useMaskedInterleavedAccesses(*TTI) &&
2136 "masked interleaved groups are not allowed.");
2137 Value *GroupMask = MaskForGaps;
2138 if (IsMaskForCondRequired) {
2139 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2140 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2141 Value *ShuffledMask = Builder.CreateShuffleVector(
2142 Mask[Part], Undefs, RepMask, "interleaved.mask");
2143 GroupMask = MaskForGaps
2144 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2149 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2150 GroupMask, UndefVec, "wide.masked.vec");
2153 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2154 Group->getAlignment(), "wide.vec");
2155 Group->addMetadata(NewLoad);
2156 NewLoads.push_back(NewLoad);
2159 // For each member in the group, shuffle out the appropriate data from the
2161 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2162 Instruction *Member = Group->getMember(I);
2164 // Skip the gaps in the group.
2168 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2169 for (unsigned Part = 0; Part < UF; Part++) {
2170 Value *StridedVec = Builder.CreateShuffleVector(
2171 NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2173 // If this member has different type, cast the result type.
2174 if (Member->getType() != ScalarTy) {
2175 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2176 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2179 if (Group->isReverse())
2180 StridedVec = reverseVector(StridedVec);
2182 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);

  // The sub vector type for current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr;
    if (IsMaskForCondRequired) {
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          Mask[Part], Undefs, RepMask, "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    } else
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
                                                 Group->getAlignment());

    Group->addMetadata(NewStoreInstr);
  }
}

void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VectorParts *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
         "CM decision should be taken at this point");
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = getMemInstValueType(Instr);
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getLoadStorePointerOperand(Instr);
  unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  if (!Alignment)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");

  // Handle consecutive loads/stores.
  if (ConsecutiveStride)
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});

  VectorParts Mask;
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    Mask = *BlockInMask;

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
    InBounds = gep->isInBounds();

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
      PartPtr->setIsInBounds(InBounds);
    }

    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used
          // in another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, Ptr);
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            Mask[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, Ptr);
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}
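
// For example, for a reverse consecutive access at VF = 4, CreateVecPtr above
// offsets the pointer by -Part * VF and then by 1 - VF, so Part 0 covers
// elements [i-3, i] and Part 1 covers [i-7, i-4]; the loaded (or stored)
// vector is then reversed element-wise to restore the original iteration
// order.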

void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // Add the cloned scalar to the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
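
// The induction created here drives the vector loop; the resulting skeleton
// looks roughly like the following (VF * UF = 8, names illustrative):
//   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
//   ...
//   %index.next = add i64 %index, 8
//   %cmp = icmp eq i64 %index.next, %n.vec
//   br i1 %cmp, label %middle.block, label %vector.body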

Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a
  // case truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
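
// For example, for the loop 'for (i = 0; i <= 41; ++i)' the backedge-taken
// count is 41, and the trip count materialized in the preheader is
// 41 + 1 = 42.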

Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  Constant *Step = ConstantInt::get(Ty, VF * UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // If there is a non-reversed interleaved group that may speculatively access
  // memory out-of-bounds, we need to ensure that there will be at least one
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step.
  if (VF > 1 && Cost->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
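
// A worked example: with N = 30 and VF * UF = 8, n.mod.vf = 30 % 8 = 6 and
// n.vec = 30 - 6 = 24, leaving 6 scalar iterations. If a scalar epilogue is
// required and N = 32, the zero remainder is bumped to 8, giving n.vec = 24
// so the epilogue still runs. With tail folding, N is first increased by
// Step - 1 (30 -> 37), so n.vec = 37 - 37 % 8 = 32.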

Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  unsigned VF = DstVTy->getNumElements();
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers or vice-versa. Handle this using a two-step bitcast through an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  VectorType *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
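
// For example, on a 64-bit target a <2 x double> value cannot be bitcast
// directly to <2 x i64*>, so the code above goes through <2 x i64>:
//   %tmp = bitcast <2 x double> %v to <2 x i64>
//   %res = inttoptr <2 x i64> %tmp to <2 x i64*>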

void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If tail is to be folded, the vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking())
    CheckMinIters = Builder.CreateICmp(
        P, Count, ConstantInt::get(Count->getType(), VF * UF),
        "min.iters.check");

  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
  LoopBypassBlocks.push_back(BB);
}

void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code to check the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck =
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!Cost->foldTailByMasking() &&
         "Cannot SCEV check stride or overflow when folding tail");
  // Create a new block containing the stride check.
  BB->setName("vector.scevcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}

void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
  if (!MemRuntimeCheck)
    return;

  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
  // Create a new block containing the memory check.
  BB->setName("vector.memcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                           PSE.getSE());
  LVer->prepareNoAliasMetadata();
}

Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {
  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                           &*B.GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    // Floating point operations had to be 'fast' to enable the induction.
    FastMathFlags Flags;
    Flags.setFast();

    Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check: the MulExp may be a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
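
// For example, an integer induction with start value 5 and step 3 transforms
// an index i into 5 + i * 3 (with the multiply or add folded away when an
// operand is a matching constant); a pointer induction becomes a GEP off the
// start pointer, and an FP induction builds the same expression with
// fmul and fadd (or fsub).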

BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  |   -[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */
2795 BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2796 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2797 BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2798 MDNode *OrigLoopID = OrigLoop->getLoopID();
2799 assert(VectorPH && "Invalid loop structure");
2800 assert(ExitBlock && "Must have an exit block");
2802 // Some loops have a single integer induction variable, while other loops
2803 // don't. One example is c++ iterators that often have multiple pointer
2804 // induction variables. In the code below we also support a case where we
2805 // don't have a single induction variable.
2807 // We try to obtain an induction variable from the original loop as hard
2808 // as possible. However if we don't find one that:
2810 // - counts from zero, stepping by one
2811 // - is the size of the widest induction variable type
2812 // then we create a new one.
2813 OldInduction = Legal->getPrimaryInduction();
2814 Type *IdxTy = Legal->getWidestInductionType();
2816 // Split the single block loop into the two loop structure described above.
2817 BasicBlock *VecBody =
2818 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2819 BasicBlock *MiddleBlock =
2820 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2821 BasicBlock *ScalarPH =
2822 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2824 // Create and register the new vector loop.
2825 Loop *Lp = LI->AllocateLoop();
2826 Loop *ParentLoop = OrigLoop->getParentLoop();
2828 // Insert the new loop into the loop nest and register the new basic blocks
2829 // before calling any utilities such as SCEV that require valid LoopInfo.
2831 ParentLoop->addChildLoop(Lp);
2832 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2833 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2835 LI->addTopLevelLoop(Lp);
2837 Lp->addBasicBlockToLoop(VecBody, *LI);
2839 // Find the loop boundaries.
2840 Value *Count = getOrCreateTripCount(Lp);
2842 Value *StartIdx = ConstantInt::get(IdxTy, 0);
2844 // Now, compare the new count to zero. If it is zero, skip the vector loop and
2845 // jump to the scalar loop. This check also covers the case where the
2846 // backedge-taken count is uint##_max: adding one to it will overflow leading
2847 // to an incorrect trip count of zero. In this (rare) case we will also jump
2848 // to the scalar loop.
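// For example, with an i8 induction and a backedge-taken count of 255, the
// trip count 255 + 1 wraps around to 0, and the check below correctly sends
// execution to the scalar loop.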
2849 emitMinimumIterationCountCheck(Lp, ScalarPH);
2851 // Generate the code to check any assumptions that we've made for SCEV
2853 emitSCEVChecks(Lp, ScalarPH);
2855 // Generate the code that checks in runtime if arrays overlap. We put the
2856 // checks into a separate block to make the more common case of few elements
2858 emitMemRuntimeChecks(Lp, ScalarPH);
2860 // Generate the induction variable.
2861 // The loop step is equal to the vectorization factor (num of SIMD elements)
2862 // times the unroll factor (num of SIMD instructions).
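// For example, with VF = 4 and UF = 2 the induction advances by 8 per
// vector-loop iteration: 0, 8, 16, ...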
2863 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2864 Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2866 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2867 getDebugLocFromInstOrOperands(OldInduction));
2869 // We are going to resume the execution of the scalar loop.
2870 // Go over all of the induction variables that we found and fix the
2871 // PHIs that are left in the scalar version of the loop.
2872 // The starting values of PHI nodes depend on the counter of the last
2873 // iteration in the vectorized loop.
2874 // If we come from a bypass edge then we need to start from the original
2877 // This variable saves the new starting index for the scalar loop. It is used
2878 // to test if there are any tail iterations left once the vector loop has
2880 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2881 for (auto &InductionEntry : *List) {
2882 PHINode *OrigPhi = InductionEntry.first;
2883 InductionDescriptor II = InductionEntry.second;
2885 // Create phi nodes to merge from the backedge-taken check block.
2886 PHINode *BCResumeVal = PHINode::Create(
2887 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2888 // Copy the original phi's debug location over to the new one.
2889 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2890 Value *&EndValue = IVEndValues[OrigPhi];
2891 if (OrigPhi == OldInduction) {
2892 // We know what the end value is.
2893 EndValue = CountRoundDown;
2895 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2896 Type *StepType = II.getStep()->getType();
2897 Instruction::CastOps CastOp =
2898 CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2899 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2900 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2901 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2902 EndValue->setName("ind.end");
2905 // The new PHI merges the original incoming value, in case of a bypass,
2906 // or the value at the end of the vectorized loop.
2907 BCResumeVal->addIncoming(EndValue, MiddleBlock);
2909 // Fix the scalar body counter (PHI node).
2910 // The old induction's phi node in the scalar body needs the truncated
2912 for (BasicBlock *BB : LoopBypassBlocks)
2913 BCResumeVal->addIncoming(II.getStartValue(), BB);
2914 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2917 // We need the OrigLoop (scalar loop part) latch terminator to help
2918 // produce correct debug info for the middle block BB instructions.
2919 // The legality check stage guarantees that the loop will have a single
2921 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
2922 "Scalar loop latch terminator isn't a branch");
2923 BranchInst *ScalarLatchBr =
2924 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
2926 // Add a check in the middle block to see if we have completed
2927 // all of the iterations in the first vector loop.
2928 // If (N - N%VF) == N, then we *don't* need to run the remainder.
2929 // If tail is to be folded, we know we don't need to run the remainder.
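// For example, with N = 10, VF = 4 and UF = 1, CountRoundDown is 8; since
// 8 != 10, the scalar loop runs the last two iterations. With N = 12 the
// counts match and the remainder is skipped entirely.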
2930 Value *CmpN = Builder.getTrue();
2931 if (!Cost->foldTailByMasking()) {
2933 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2934 CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2936 // Here we use the same DebugLoc as the scalar loop latch branch instead
2937 // of the corresponding compare because they may have ended up with
2938 // different line numbers and we want to avoid awkward line stepping while
2939 // debugging. E.g., if the compare got a line number inside the loop.
2940 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
2943 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
2944 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
2945 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
2947 // Get ready to start creating new instructions into the vectorized body.
2948 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2951 LoopVectorPreHeader = Lp->getLoopPreheader();
2952 LoopScalarPreHeader = ScalarPH;
2953 LoopMiddleBlock = MiddleBlock;
2954 LoopExitBlock = ExitBlock;
2955 LoopVectorBody = VecBody;
2956 LoopScalarBody = OldBasicBlock;
2958 Optional<MDNode *> VectorizedLoopID =
2959 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
2960 LLVMLoopVectorizeFollowupVectorized});
2961 if (VectorizedLoopID.hasValue()) {
2962 Lp->setLoopID(VectorizedLoopID.getValue());
2964 // Do not setAlreadyVectorized if loop attributes have been defined
2966 return LoopVectorPreHeader;
2969 // Keep all loop hints from the original loop on the vector loop (we'll
2970 // replace the vectorizer-specific hints below).
2971 if (MDNode *LID = OrigLoop->getLoopID())
2974 LoopVectorizeHints Hints(Lp, true, *ORE);
2975 Hints.setAlreadyVectorized();
2977 return LoopVectorPreHeader;
2980 // Fix up external users of the induction variable. At this point, we are
2981 // in LCSSA form, with all external PHIs that use the IV having one input value,
2982 // coming from the remainder loop. We need those PHIs to also have a correct
2983 // value for the IV when arriving directly from the middle block.
2984 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2985 const InductionDescriptor &II,
2986 Value *CountRoundDown, Value *EndValue,
2987 BasicBlock *MiddleBlock) {
2988 // There are two kinds of external IV usages - those that use the value
2989 // computed in the last iteration (the PHI) and those that use the penultimate
2990 // value (the value that feeds into the phi from the loop latch).
2991 // We allow both, but they obviously have different values.
2993 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2995 DenseMap<Value *, Value *> MissingVals;
2997 // An external user of the last iteration's value should see the value that
2998 // the remainder loop uses to initialize its own IV.
2999 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3000 for (User *U : PostInc->users()) {
3001 Instruction *UI = cast<Instruction>(U);
3002 if (!OrigLoop->contains(UI)) {
3003 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3004 MissingVals[UI] = EndValue;
3008 // An external user of the penultimate value needs to see EndValue - Step.
3009 // The simplest way to get this is to recompute it from the constituent SCEVs,
3010 // that is Start + (Step * (CRD - 1)).
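// For example (hypothetical values), with Start = 0, Step = 2 and a vector
// trip count CRD = 8, the escaping value is 0 + 2 * (8 - 1) = 14.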
3011 for (User *U : OrigPhi->users()) {
3012 auto *UI = cast<Instruction>(U);
3013 if (!OrigLoop->contains(UI)) {
3014 const DataLayout &DL =
3015 OrigLoop->getHeader()->getModule()->getDataLayout();
3016 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3018 IRBuilder<> B(MiddleBlock->getTerminator());
3019 Value *CountMinusOne = B.CreateSub(
3020 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3022 !II.getStep()->getType()->isIntegerTy()
3023 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3024 II.getStep()->getType())
3025 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3026 CMO->setName("cast.cmo");
3027 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3028 Escape->setName("ind.escape");
3029 MissingVals[UI] = Escape;
3033 for (auto &I : MissingVals) {
3034 PHINode *PHI = cast<PHINode>(I.first);
3035 // One corner case we have to handle is two IVs "chasing" each other,
3036 // that is %IV2 = phi [...], [ %IV1, %latch ]
3037 // In this case, if IV1 has an external use, we need to avoid adding both
3038 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3039 // don't already have an incoming value for the middle block.
3040 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3041 PHI->addIncoming(I.second, MiddleBlock);
3047 struct CSEDenseMapInfo {
3048 static bool canHandle(const Instruction *I) {
3049 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3050 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3053 static inline Instruction *getEmptyKey() {
3054 return DenseMapInfo<Instruction *>::getEmptyKey();
3057 static inline Instruction *getTombstoneKey() {
3058 return DenseMapInfo<Instruction *>::getTombstoneKey();
3061 static unsigned getHashValue(const Instruction *I) {
3062 assert(canHandle(I) && "Unknown instruction!");
3063 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3064 I->value_op_end()));
3067 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3068 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3069 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3071 return LHS->isIdenticalTo(RHS);
3075 } // end anonymous namespace
3077 /// Perform CSE of induction variable instructions.
3078 static void cse(BasicBlock *BB) {
3079 // Perform simple CSE.
3080 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3081 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3082 Instruction *In = &*I++;
3084 if (!CSEDenseMapInfo::canHandle(In))
3087 // Check if we can replace this instruction with any of the
3088 // visited instructions.
3089 if (Instruction *V = CSEMap.lookup(In)) {
3090 In->replaceAllUsesWith(V);
3091 In->eraseFromParent();
3099 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3101 bool &NeedToScalarize) {
3102 Function *F = CI->getCalledFunction();
3103 StringRef FnName = CI->getCalledFunction()->getName();
3104 Type *ScalarRetTy = CI->getType();
3105 SmallVector<Type *, 4> Tys, ScalarTys;
3106 for (auto &ArgOp : CI->arg_operands())
3107 ScalarTys.push_back(ArgOp->getType());
3109 // Estimate cost of scalarized vector call. The source operands are assumed
3110 // to be vectors, so we need to extract individual elements from there,
3111 // execute VF scalar calls, and then gather the result into the vector return
3113 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3115 return ScalarCallCost;
3117 // Compute corresponding vector type for return value and arguments.
3118 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3119 for (Type *ScalarTy : ScalarTys)
3120 Tys.push_back(ToVectorTy(ScalarTy, VF));
3122 // Compute costs of unpacking argument values for the scalar calls and
3123 // packing the return values to a vector.
3124 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3126 unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
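// For example (hypothetical costs), with VF = 4, a scalar call cost of 10
// and a scalarization overhead of 12, the scalarized estimate is
// 4 * 10 + 12 = 52; a vectorized library variant costing less than that
// (checked below) would then be preferred.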
3128 // If we can't emit a vector call for this function, then the currently found
3129 // cost is the cost we need to return.
3130 NeedToScalarize = true;
3131 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3134 // If the corresponding vector cost is cheaper, return its cost.
3135 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3136 if (VectorCallCost < Cost) {
3137 NeedToScalarize = false;
3138 return VectorCallCost;
3143 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3145 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3146 assert(ID && "Expected intrinsic call!");
3149 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3150 FMF = FPMO->getFastMathFlags();
3152 SmallVector<Value *, 4> Operands(CI->arg_operands());
3153 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
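/// Returns whichever of the two vector types has the narrower integer
/// elements, e.g. <4 x i16> rather than <4 x i32>.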
3156 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3157 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3158 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3159 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
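/// Returns whichever of the two vector types has the wider integer
/// elements, e.g. <4 x i32> rather than <4 x i16>.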
3161 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3162 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3163 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3164 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3167 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3168 // For every instruction `I` in MinBWs, truncate the operands, create a
3169 // truncated version of `I` and re-extend its result. InstCombine runs
3170 // later and will remove any ext/trunc pairs.
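// For example (a sketch), if an i32 add is known to need only 8 bits, for
// VF = 4 it becomes:
//   %a.tr = trunc <4 x i32> %a to <4 x i8>
//   %b.tr = trunc <4 x i32> %b to <4 x i8>
//   %s.tr = add <4 x i8> %a.tr, %b.tr
//   %s    = zext <4 x i8> %s.tr to <4 x i32>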
3171 SmallPtrSet<Value *, 4> Erased;
3172 for (const auto &KV : Cost->getMinimalBitwidths()) {
3173 // If the value wasn't vectorized, we must maintain the original scalar
3174 // type. The absence of the value from VectorLoopValueMap indicates that it
3175 // wasn't vectorized.
3176 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3178 for (unsigned Part = 0; Part < UF; ++Part) {
3179 Value *I = getOrCreateVectorValue(KV.first, Part);
3180 if (Erased.find(I) != Erased.end() || I->use_empty() ||
3181 !isa<Instruction>(I))
3183 Type *OriginalTy = I->getType();
3184 Type *ScalarTruncatedTy =
3185 IntegerType::get(OriginalTy->getContext(), KV.second);
3186 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3187 OriginalTy->getVectorNumElements());
3188 if (TruncatedTy == OriginalTy)
3191 IRBuilder<> B(cast<Instruction>(I));
3192 auto ShrinkOperand = [&](Value *V) -> Value * {
3193 if (auto *ZI = dyn_cast<ZExtInst>(V))
3194 if (ZI->getSrcTy() == TruncatedTy)
3195 return ZI->getOperand(0);
3196 return B.CreateZExtOrTrunc(V, TruncatedTy);
3199 // The actual instruction modification depends on the instruction type,
3201 Value *NewI = nullptr;
3202 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3203 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3204 ShrinkOperand(BO->getOperand(1)));
3206 // Any wrapping introduced by shrinking this operation shouldn't be
3207 // considered undefined behavior. So, we can't unconditionally copy
3208 // arithmetic wrapping flags to NewI.
3209 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3210 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3212 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3213 ShrinkOperand(CI->getOperand(1)));
3214 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3215 NewI = B.CreateSelect(SI->getCondition(),
3216 ShrinkOperand(SI->getTrueValue()),
3217 ShrinkOperand(SI->getFalseValue()));
3218 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3219 switch (CI->getOpcode()) {
3221 llvm_unreachable("Unhandled cast!");
3222 case Instruction::Trunc:
3223 NewI = ShrinkOperand(CI->getOperand(0));
3225 case Instruction::SExt:
3226 NewI = B.CreateSExtOrTrunc(
3228 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3230 case Instruction::ZExt:
3231 NewI = B.CreateZExtOrTrunc(
3233 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3236 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3237 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3238 auto *O0 = B.CreateZExtOrTrunc(
3239 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3240 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3241 auto *O1 = B.CreateZExtOrTrunc(
3242 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3244 NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3245 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3246 // Don't do anything with the operands, just extend the result.
3248 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3249 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3250 auto *O0 = B.CreateZExtOrTrunc(
3251 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3252 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3253 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3254 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3255 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3256 auto *O0 = B.CreateZExtOrTrunc(
3257 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3258 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3260 // If we don't know what to do, be conservative and don't do anything.
3264 // Lastly, extend the result.
3265 NewI->takeName(cast<Instruction>(I));
3266 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3267 I->replaceAllUsesWith(Res);
3268 cast<Instruction>(I)->eraseFromParent();
3270 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3274 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3275 for (const auto &KV : Cost->getMinimalBitwidths()) {
3276 // If the value wasn't vectorized, we must maintain the original scalar
3277 // type. The absence of the value from VectorLoopValueMap indicates that it
3278 // wasn't vectorized.
3279 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3281 for (unsigned Part = 0; Part < UF; ++Part) {
3282 Value *I = getOrCreateVectorValue(KV.first, Part);
3283 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3284 if (Inst && Inst->use_empty()) {
3285 Value *NewI = Inst->getOperand(0);
3286 Inst->eraseFromParent();
3287 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3293 void InnerLoopVectorizer::fixVectorizedLoop() {
3294 // Insert truncates and extends for any truncated instructions as hints to
3297 truncateToMinimalBitwidths();
3299 // Fix widened non-induction PHIs by setting up the PHI operands.
3300 if (OrigPHIsToFix.size()) {
3301 assert(EnableVPlanNativePath &&
3302 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3303 fixNonInductionPHIs();
3306 // At this point every instruction in the original loop is widened to a
3307 // vector form. Now we need to fix the recurrences in the loop. These PHI
3308 // nodes are currently empty because we did not want to introduce cycles.
3309 // This is the second stage of vectorizing recurrences.
3310 fixCrossIterationPHIs();
3312 // Update the dominator tree.
3314 // FIXME: After creating the structure of the new loop, the dominator tree is
3315 // no longer up-to-date, and it remains that way until we update it
3316 // here. An out-of-date dominator tree is problematic for SCEV,
3317 // because SCEVExpander uses it to guide code generation. The
3318 // vectorizer uses SCEVExpander in several places. Instead, we should
3319 // keep the dominator tree up-to-date as we go.
3322 // Fix up external users of the induction variables.
3323 for (auto &Entry : *Legal->getInductionVars())
3324 fixupIVUsers(Entry.first, Entry.second,
3325 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3326 IVEndValues[Entry.first], LoopMiddleBlock);
3329 for (Instruction *PI : PredicatedInstructions)
3330 sinkScalarOperands(&*PI);
3332 // Remove redundant induction instructions.
3333 cse(LoopVectorBody);
3336 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3337 // In order to support recurrences we need to be able to vectorize Phi nodes.
3338 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3339 // stage #2: We now need to fix the recurrences by adding incoming edges to
3340 // the currently empty PHI nodes. At this point every instruction in the
3341 // original loop is widened to a vector form so we can use them to construct
3342 // the incoming edges.
3343 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3344 // Handle first-order recurrences and reductions that need to be fixed.
3345 if (Legal->isFirstOrderRecurrence(&Phi))
3346 fixFirstOrderRecurrence(&Phi);
3347 else if (Legal->isReductionVariable(&Phi))
3352 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3353 // This is the second phase of vectorizing first-order recurrences. An
3354 // overview of the transformation is described below. Suppose we have the
3357 // for (int i = 0; i < n; ++i)
3358 // b[i] = a[i] - a[i - 1];
3360 // There is a first-order recurrence on "a". For this loop, the shorthand
3361 // scalar IR looks like:
3368 // i = phi [0, scalar.ph], [i+1, scalar.body]
3369 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3372 // br cond, scalar.body, ...
3374 // In this example, s1 is a recurrence because its value depends on the
3375 // previous iteration. In the first phase of vectorization, we created a
3376 // temporary value for s1. We now complete the vectorization and produce the
3377 // shorthand vector IR shown below (for VF = 4, UF = 1).
3380 // v_init = vector(..., ..., ..., a[-1])
3384 // i = phi [0, vector.ph], [i+4, vector.body]
3385 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3386 // v2 = a[i, i+1, i+2, i+3];
3387 // v3 = vector(v1(3), v2(0, 1, 2))
3388 // b[i, i+1, i+2, i+3] = v2 - v3
3389 // br cond, vector.body, middle.block
3396 // s_init = phi [x, middle.block], [a[-1], otherwise]
3399 // After the vector loop completes execution, we extract the next value of
3400 // the recurrence (x) to use as the initial value in the scalar loop.
3402 // Get the original loop preheader and single loop latch.
3403 auto *Preheader = OrigLoop->getLoopPreheader();
3404 auto *Latch = OrigLoop->getLoopLatch();
3406 // Get the initial and previous values of the scalar recurrence.
3407 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3408 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3410 // Create a vector from the initial value.
3411 auto *VectorInit = ScalarInit;
3413 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3414 VectorInit = Builder.CreateInsertElement(
3415 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3416 Builder.getInt32(VF - 1), "vector.recur.init");
3419 // We constructed a temporary phi node in the first phase of vectorization.
3420 // This phi node will eventually be deleted.
3421 Builder.SetInsertPoint(
3422 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3424 // Create a phi node for the new recurrence. The current value will either be
3425 // the initial value inserted into a vector or loop-varying vector value.
3426 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3427 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3429 // Get the vectorized previous value of the last part UF - 1. It appears last
3430 // among all unrolled iterations, due to the order of their construction.
3431 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3433 // Set the insertion point after the previous value if it is an instruction.
3434 // Note that the previous value may have been constant-folded so it is not
3435 // guaranteed to be an instruction in the vector loop. Also, if the previous
3436 // value is a phi node, we should insert after all the phi nodes to avoid
3437 // breaking basic block verification.
3438 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3439 isa<PHINode>(PreviousLastPart))
3440 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3442 Builder.SetInsertPoint(
3443 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3445 // We will construct a vector for the recurrence by combining the values for
3446 // the current and previous iterations. This is the required shuffle mask.
3447 SmallVector<Constant *, 8> ShuffleMask(VF);
3448 ShuffleMask[0] = Builder.getInt32(VF - 1);
3449 for (unsigned I = 1; I < VF; ++I)
3450 ShuffleMask[I] = Builder.getInt32(I + VF - 1);
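// For example, with VF = 4 the mask is <3, 4, 5, 6>: the last element of
// the first shuffle operand followed by the first three elements of the
// second, matching the v3 = vector(v1(3), v2(0, 1, 2)) step in the example
// above.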
3452 // The vector from which to take the initial value for the current iteration
3453 // (actual or unrolled). Initially, this is the vector phi node.
3454 Value *Incoming = VecPhi;
3456 // Shuffle the current and previous vector and update the vector parts.
3457 for (unsigned Part = 0; Part < UF; ++Part) {
3458 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3459 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3461 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3462 ConstantVector::get(ShuffleMask))
3464 PhiPart->replaceAllUsesWith(Shuffle);
3465 cast<Instruction>(PhiPart)->eraseFromParent();
3466 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3467 Incoming = PreviousPart;
3470 // Fix the latch value of the new recurrence in the vector loop.
3471 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3473 // Extract the last vector element in the middle block. This will be the
3474 // initial value for the recurrence when jumping to the scalar loop.
3475 auto *ExtractForScalar = Incoming;
3477 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3478 ExtractForScalar = Builder.CreateExtractElement(
3479 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3481 // Extract the second-to-last element in the middle block if the
3482 // Phi is used outside the loop. We need to extract the phi itself
3483 // and not the last element (the phi update in the current iteration). This
3484 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3485 // when the scalar loop is not run at all.
3486 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3488 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3489 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3490 // When the loop is unrolled without vectorizing, initialize
3491 // ExtractForPhiUsedOutsideLoop with the part just prior to the unrolled
3492 // value of `Incoming`. This is analogous to the vectorized case above:
3493 // extracting the second-to-last element when VF > 1.
3495 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3497 // Fix the initial value of the original recurrence in the scalar loop.
3498 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3499 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3500 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3501 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3502 Start->addIncoming(Incoming, BB);
3505 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3506 Phi->setName("scalar.recur");
3508 // Finally, fix users of the recurrence outside the loop. The users will need
3509 // either the last value of the scalar recurrence or the last value of the
3510 // vector recurrence we extracted in the middle block. Since the loop is in
3511 // LCSSA form, we just need to find all the phi nodes for the original scalar
3512 // recurrence in the exit block, and then add an edge for the middle block.
3513 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3514 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3515 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3520 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3521 Constant *Zero = Builder.getInt32(0);
3523 // Get its reduction variable descriptor.
3524 assert(Legal->isReductionVariable(Phi) &&
3525 "Unable to find the reduction variable");
3526 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3528 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3529 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3530 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3531 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3532 RdxDesc.getMinMaxRecurrenceKind();
3533 setDebugLocFromInst(Builder, ReductionStartValue);
3535 // We need to generate a reduction vector from the incoming scalar.
3536 // To do so, we need to generate the 'identity' vector and override
3537 // one of the elements with the incoming scalar reduction. We need
3538 // to do it in the vector-loop preheader.
3539 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3541 // This is the vector-clone of the value that leaves the loop.
3542 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3544 // Find the reduction identity variable. Zero for addition, OR and XOR;
3545 // one for multiplication; -1 for AND.
3548 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3549 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3550 // MinMax reductions have the start value as their identity.
3552 VectorStart = Identity = ReductionStartValue;
3554 VectorStart = Identity =
3555 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3558 // Handle other reduction kinds:
3559 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3560 RK, VecTy->getScalarType());
3563 // This vector is the Identity vector where the first element is the
3564 // incoming scalar reduction.
3565 VectorStart = ReductionStartValue;
3567 Identity = ConstantVector::getSplat(VF, Iden);
3569 // This vector is the Identity vector where the first element is the
3570 // incoming scalar reduction.
3572 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
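// For example, for an integer add reduction with VF = 4 and scalar start
// value %s, Identity is <0, 0, 0, 0> and VectorStart is <%s, 0, 0, 0>.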
3576 // Fix the vector-loop phi.
3578 // Reductions do not have to start at zero. They can start with
3579 // any loop invariant values.
3580 BasicBlock *Latch = OrigLoop->getLoopLatch();
3581 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3582 for (unsigned Part = 0; Part < UF; ++Part) {
3583 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3584 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3585 // Make sure to add the reduction start value only to the
3586 // first unroll part.
3587 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3588 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3589 cast<PHINode>(VecRdxPhi)
3590 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3593 // Before each round, move the insertion point right between
3594 // the PHIs and the values we are going to write.
3595 // This allows us to write both PHINodes and the extractelement
3597 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3599 setDebugLocFromInst(Builder, LoopExitInst);
3601 // If the vector reduction can be performed in a smaller type, we truncate
3602 // then extend the loop exit value to enable InstCombine to evaluate the
3603 // entire expression in the smaller type.
3604 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3605 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3606 Builder.SetInsertPoint(
3607 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3608 VectorParts RdxParts(UF);
3609 for (unsigned Part = 0; Part < UF; ++Part) {
3610 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3611 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3612 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3613 : Builder.CreateZExt(Trunc, VecTy);
3614 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3615 UI != RdxParts[Part]->user_end();)
3617 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3618 RdxParts[Part] = Extnd;
3623 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3624 for (unsigned Part = 0; Part < UF; ++Part) {
3625 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3626 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3630 // Reduce all of the unrolled parts into a single vector.
3631 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3632 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3634 // The middle block terminator has already been assigned a DebugLoc here (the
3635 // OrigLoop's single latch terminator). We want the whole middle block to
3636 // appear to execute on this line because: (a) it is all compiler generated,
3637 // (b) these instructions are always executed after evaluating the latch
3638 // conditional branch, and (c) other passes may add new predecessors which
3639 // terminate on this line. This is the easiest way to ensure we don't
3640 // accidentally cause an extra step back into the loop while debugging.
3641 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3642 for (unsigned Part = 1; Part < UF; ++Part) {
3643 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3644 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3645 // Floating point operations had to be 'fast' to enable the reduction.
3646 ReducedPartRdx = addFastMathFlag(
3647 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3648 ReducedPartRdx, "bin.rdx"),
3649 RdxDesc.getFastMathFlags());
3651 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3656 bool NoNaN = Legal->hasFunNoNaNAttr();
3658 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3659 // If the reduction can be performed in a smaller type, we need to extend
3660 // the reduction to the wider type before we branch to the original loop.
3661 if (Phi->getType() != RdxDesc.getRecurrenceType())
3664 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3665 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3668 // Create a phi node that merges control-flow from the backedge-taken check
3669 // block and the middle block.
3670 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3671 LoopScalarPreHeader->getTerminator());
3672 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3673 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3674 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3676 // Now, we need to fix the users of the reduction variable
3677 // inside and outside of the scalar remainder loop.
3678 // We know that the loop is in LCSSA form. We need to update the
3679 // PHI nodes in the exit blocks.
3680 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3681 // All PHINodes need to have a single entry edge, or two if
3682 // we already fixed them.
3683 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3685 // We found a reduction value exit-PHI. Update it with the
3686 // incoming bypass edge.
3687 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3688 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3689 } // end of the LCSSA phi scan.
3691 // Fix the scalar loop reduction variable with the incoming reduction sum
3692 // from the vector body and from the backedge value.
3693 int IncomingEdgeBlockIdx =
3694 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3695 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3696 // Pick the other block.
3697 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3698 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3699 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3702 void InnerLoopVectorizer::fixLCSSAPHIs() {
3703 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3704 if (LCSSAPhi.getNumIncomingValues() == 1) {
3705 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3706 // Non-instruction incoming values will have only one value.
3707 unsigned LastLane = 0;
3708 if (isa<Instruction>(IncomingValue))
3709 LastLane = Cost->isUniformAfterVectorization(
3710 cast<Instruction>(IncomingValue), VF)
3713 // Can be a loop invariant incoming value or the last scalar value to be
3714 // extracted from the vectorized loop.
3715 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3716 Value *lastIncomingValue =
3717 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3718 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3723 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3724 // The basic block and loop containing the predicated instruction.
3725 auto *PredBB = PredInst->getParent();
3726 auto *VectorLoop = LI->getLoopFor(PredBB);
3728 // Initialize a worklist with the operands of the predicated instruction.
3729 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3731 // Holds instructions that we need to analyze again. An instruction may be
3732 // reanalyzed if we don't yet know if we can sink it or not.
3733 SmallVector<Instruction *, 8> InstsToReanalyze;
3735 // Returns true if a given use occurs in the predicated block. Phi nodes use
3736 // their operands in their corresponding predecessor blocks.
3737 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3738 auto *I = cast<Instruction>(U.getUser());
3739 BasicBlock *BB = I->getParent();
3740 if (auto *Phi = dyn_cast<PHINode>(I))
3741 BB = Phi->getIncomingBlock(
3742 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3743 return BB == PredBB;
3746 // Iteratively sink the scalarized operands of the predicated instruction
3747 // into the block we created for it. When an instruction is sunk, its
3748 // operands are then added to the worklist. The algorithm ends after one pass
3749 // through the worklist doesn't sink a single instruction.
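// For example, a getelementptr feeding only a predicated store can be sunk
// into the predicated block; its index computation may in turn become
// sinkable, which is why we iterate until a fixed point is reached.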
3752 // Add the instructions that need to be reanalyzed to the worklist, and
3753 // reset the changed indicator.
3754 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3755 InstsToReanalyze.clear();
3758 while (!Worklist.empty()) {
3759 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3761 // We can't sink an instruction if it is a phi node, is already in the
3762 // predicated block, is not in the loop, or may have side effects.
3763 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3764 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3767 // It's legal to sink the instruction if all its uses occur in the
3768 // predicated block. Otherwise, there's nothing to do yet, and we may
3769 // need to reanalyze the instruction.
3770 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3771 InstsToReanalyze.push_back(I);
3775 // Move the instruction to the beginning of the predicated block, and add
3776 // its operands to the worklist.
3777 I->moveBefore(&*PredBB->getFirstInsertionPt());
3778 Worklist.insert(I->op_begin(), I->op_end());
3780 // The sinking may have enabled other instructions to be sunk, so we will
3787 void InnerLoopVectorizer::fixNonInductionPHIs() {
3788 for (PHINode *OrigPhi : OrigPHIsToFix) {
3790 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3791 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3793 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3794 predecessors(OrigPhi->getParent()));
3795 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3796 predecessors(NewPhi->getParent()));
3797 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3798 "Scalar and Vector BB should have the same number of predecessors");
3800 // The insertion point in Builder may be invalidated by the time we get
3801 // here. Force the Builder insertion point to something valid so that we do
3802 // not run into issues during insertion point restore in
3803 // getOrCreateVectorValue calls below.
3804 Builder.SetInsertPoint(NewPhi);
3806 // The predecessor order is preserved and we can rely on mapping between
3807 // scalar and vector block predecessors.
3808 for (unsigned i = 0; i < NumIncomingValues; ++i) {
3809 BasicBlock *NewPredBB = VectorBBPredecessors[i];
3811 // When looking up the new scalar/vector values to fix up, use incoming
3812 // values from original phi.
3814 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3816 // The scalar incoming value may need a broadcast.
3817 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3818 NewPhi->addIncoming(NewIncV, NewPredBB);
3823 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3825 PHINode *P = cast<PHINode>(PN);
3826 if (EnableVPlanNativePath) {
3827 // Currently we enter here in the VPlan-native path for non-induction
3828 // PHIs where all control flow is uniform. We simply widen these PHIs.
3829 // Create a vector phi with no operands - the vector phi operands will be
3830 // set at the end of vector code generation.
3832 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3833 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3834 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3835 OrigPHIsToFix.push_back(P);
3840 assert(PN->getParent() == OrigLoop->getHeader() &&
3841 "Non-header phis should have been handled elsewhere");
3843 // In order to support recurrences we need to be able to vectorize Phi nodes.
3844 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3845 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3846 // this value when we vectorize all of the instructions that use the PHI.
3847 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3848 for (unsigned Part = 0; Part < UF; ++Part) {
3849 // This is phase one of vectorizing PHIs.
3851 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3852 Value *EntryPart = PHINode::Create(
3853 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3854 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3859 setDebugLocFromInst(Builder, P);
3861 // This PHINode must be an induction variable.
3862 // Make sure that we know about it.
3863 assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3865 InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3866 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3868 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3869 // which can be found from the original scalar operations.
3870 switch (II.getKind()) {
3871 case InductionDescriptor::IK_NoInduction:
3872 llvm_unreachable("Unknown induction");
3873 case InductionDescriptor::IK_IntInduction:
3874 case InductionDescriptor::IK_FpInduction:
3875 llvm_unreachable("Integer/fp induction is handled elsewhere.");
3876 case InductionDescriptor::IK_PtrInduction: {
3877 // Handle the pointer induction variable case.
3878 assert(P->getType()->isPointerTy() && "Unexpected type.");
3879 // This is the normalized GEP that starts counting at zero.
3880 Value *PtrInd = Induction;
3881 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3882 // Determine the number of scalars we need to generate for each unroll
3883 // iteration. If the instruction is uniform, we only need to generate the
3884 // first lane. Otherwise, we generate all VF values.
3885 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3886 // These are the scalar results. Notice that we don't generate vector GEPs
3887 // because scalar GEPs result in better code.
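// For example, with VF = 4, UF = 2 and a non-uniform pointer induction we
// emit eight scalar GEPs at indices PtrInd + 0 .. PtrInd + 7; if the
// induction is uniform, only lane 0 of each part is needed (indices
// PtrInd + 0 and PtrInd + 4).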
3888 for (unsigned Part = 0; Part < UF; ++Part) {
3889 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3890 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3891 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3893 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3894 SclrGep->setName("next.gep");
3895 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3903 /// A helper function for checking whether an integer division-related
3904 /// instruction may divide by zero (in which case it must be predicated if
3905 /// executed conditionally in the scalar code).
3906 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3907 /// Non-zero divisors that are not compile-time constants will not be
3908 /// converted into multiplication, so we will still end up scalarizing
3909 /// the division, but can do so w/o predication.
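/// For example, this returns true for `udiv i32 %a, %b` (variable divisor)
/// and `udiv i32 %a, 0`, but false for `udiv i32 %a, 7`.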
3910 static bool mayDivideByZero(Instruction &I) {
3911 assert((I.getOpcode() == Instruction::UDiv ||
3912 I.getOpcode() == Instruction::SDiv ||
3913 I.getOpcode() == Instruction::URem ||
3914 I.getOpcode() == Instruction::SRem) &&
3915 "Unexpected instruction");
3916 Value *Divisor = I.getOperand(1);
3917 auto *CInt = dyn_cast<ConstantInt>(Divisor);
3918 return !CInt || CInt->isZero();
3921 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3922 switch (I.getOpcode()) {
3923 case Instruction::Br:
3924 case Instruction::PHI:
3925 llvm_unreachable("This instruction is handled by a different recipe.");
3926 case Instruction::GetElementPtr: {
3927 // Construct a vector GEP by widening the operands of the scalar GEP as
3928 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3929 // results in a vector of pointers when at least one operand of the GEP
3930 // is vector-typed. Thus, to keep the representation compact, we only use
3931 // vector-typed operands for loop-varying values.
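// For example (a sketch), with VF = 4 and a loop-varying index %vec.i:
//   %vgep = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.i
// yields a <4 x i32*> vector of pointers while keeping the loop-invariant
// %base operand scalar.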
3932 auto *GEP = cast<GetElementPtrInst>(&I);
3934 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3935 // If we are vectorizing, but the GEP has only loop-invariant operands,
3936 // the GEP we build (by only using vector-typed operands for
3937 // loop-varying values) would be a scalar pointer. Thus, to ensure we
3938 // produce a vector of pointers, we need to either arbitrarily pick an
3939 // operand to broadcast, or broadcast a clone of the original GEP.
3940 // Here, we broadcast a clone of the original.
3942 // TODO: If at some point we decide to scalarize instructions having
3943 // loop-invariant operands, this special case will no longer be
3944 // required. We would add the scalarization decision to
3945 // collectLoopScalars() and teach getVectorValue() to broadcast
3946 // the lane-zero scalar value.
3947 auto *Clone = Builder.Insert(GEP->clone());
3948 for (unsigned Part = 0; Part < UF; ++Part) {
3949 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3950 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3951 addMetadata(EntryPart, GEP);
3954 // If the GEP has at least one loop-varying operand, we are sure to
3955 // produce a vector of pointers. But if we are only unrolling, we want
3956 // to produce a scalar GEP for each unroll part. Thus, the GEP we
3957 // produce with the code below will be scalar (if VF == 1) or vector
3958 // (otherwise). Note that for the unroll-only case, we still maintain
3959 // values in the vector mapping with initVector, as we do for other
3961 for (unsigned Part = 0; Part < UF; ++Part) {
3962 // The pointer operand of the new GEP. If it's loop-invariant, we
3963 // won't broadcast it.
3965 OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3966 ? GEP->getPointerOperand()
3967 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3969 // Collect all the indices for the new GEP. If any index is
3970 // loop-invariant, we won't broadcast it.
3971 SmallVector<Value *, 4> Indices;
3972 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3973 if (OrigLoop->isLoopInvariant(U.get()))
3974 Indices.push_back(U.get());
3976 Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3979 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3980 // but it should be a vector, otherwise.
3983 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
3985 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
3986 assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3987 "NewGEP is not a pointer vector");
3988 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3989 addMetadata(NewGEP, GEP);
3995 case Instruction::UDiv:
3996 case Instruction::SDiv:
3997 case Instruction::SRem:
3998 case Instruction::URem:
3999 case Instruction::Add:
4000 case Instruction::FAdd:
4001 case Instruction::Sub:
4002 case Instruction::FSub:
4003 case Instruction::FNeg:
4004 case Instruction::Mul:
4005 case Instruction::FMul:
4006 case Instruction::FDiv:
4007 case Instruction::FRem:
4008 case Instruction::Shl:
4009 case Instruction::LShr:
4010 case Instruction::AShr:
4011 case Instruction::And:
4012 case Instruction::Or:
4013 case Instruction::Xor: {
4014 // Just widen unops and binops.
4015 setDebugLocFromInst(Builder, &I);
4017 for (unsigned Part = 0; Part < UF; ++Part) {
4018 SmallVector<Value *, 2> Ops;
4019 for (Value *Op : I.operands())
4020 Ops.push_back(getOrCreateVectorValue(Op, Part));
4022 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4024 if (auto *VecOp = dyn_cast<Instruction>(V))
4025 VecOp->copyIRFlags(&I);
4027 // Use this vector value for all users of the original instruction.
4028 VectorLoopValueMap.setVectorValue(&I, Part, V);
4034 case Instruction::Select: {
4036 // If the selector is loop invariant, we can create a select
4037 // instruction with a scalar condition. Otherwise, use vector-select.
4038 auto *SE = PSE.getSE();
4039 bool InvariantCond =
4040 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4041 setDebugLocFromInst(Builder, &I);
4043 // The condition can be loop invariant but still defined inside the
4044 // loop. This means that we can't just use the original 'cond' value.
4045 // We have to take the 'vectorized' value and pick the first lane.
4046 // Instcombine will make this a no-op.
4048 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4050 for (unsigned Part = 0; Part < UF; ++Part) {
4051 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4052 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4053 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4055 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4056 VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4057 addMetadata(Sel, &I);
4063 case Instruction::ICmp:
4064 case Instruction::FCmp: {
4065 // Widen compares. Generate vector compares.
4066 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4067 auto *Cmp = dyn_cast<CmpInst>(&I);
4068 setDebugLocFromInst(Builder, Cmp);
4069 for (unsigned Part = 0; Part < UF; ++Part) {
4070 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4071 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4074 // Propagate fast math flags.
4075 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4076 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4077 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4079 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4081 VectorLoopValueMap.setVectorValue(&I, Part, C);
4088 case Instruction::ZExt:
4089 case Instruction::SExt:
4090 case Instruction::FPToUI:
4091 case Instruction::FPToSI:
4092 case Instruction::FPExt:
4093 case Instruction::PtrToInt:
4094 case Instruction::IntToPtr:
4095 case Instruction::SIToFP:
4096 case Instruction::UIToFP:
4097 case Instruction::Trunc:
4098 case Instruction::FPTrunc:
4099 case Instruction::BitCast: {
4100 auto *CI = dyn_cast<CastInst>(&I);
4101 setDebugLocFromInst(Builder, CI);
4103 // Vectorize casts.
4105 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4107 for (unsigned Part = 0; Part < UF; ++Part) {
4108 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4109 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4110 VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4111 addMetadata(Cast, &I);
4116 case Instruction::Call: {
4117 // Ignore dbg intrinsics.
4118 if (isa<DbgInfoIntrinsic>(I))
4120 setDebugLocFromInst(Builder, &I);
4122 Module *M = I.getParent()->getParent()->getParent();
4123 auto *CI = cast<CallInst>(&I);
4125 StringRef FnName = CI->getCalledFunction()->getName();
4126 Function *F = CI->getCalledFunction();
4127 Type *RetTy = ToVectorTy(CI->getType(), VF);
4128 SmallVector<Type *, 4> Tys;
4129 for (Value *ArgOperand : CI->arg_operands())
4130 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4132 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4134 // This flag indicates whether we use an intrinsic or a library call for
4135 // the vectorized version of the instruction:
4136 // is it beneficial to use the intrinsic rather than the library call?
4137 bool NeedToScalarize;
4138 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4139 bool UseVectorIntrinsic =
4140 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4141 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4142 "Instruction should be scalarized elsewhere.");
4144 for (unsigned Part = 0; Part < UF; ++Part) {
4145 SmallVector<Value *, 4> Args;
4146 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4147 Value *Arg = CI->getArgOperand(i);
4148 // Some intrinsics have a scalar argument - don't replace it with a
4150 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4151 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4152 Args.push_back(Arg);
4156 if (UseVectorIntrinsic) {
4157 // Use vector version of the intrinsic.
4158 Type *TysForDecl[] = {CI->getType()};
4160 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4161 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4163 // Use vector version of the library call.
4164 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4165 assert(!VFnName.empty() && "Vector function name is empty.");
4166 VectorF = M->getFunction(VFnName);
4168 // Generate a declaration
4169 FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4171 Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4172 VectorF->copyAttributesFrom(F);
4175 assert(VectorF && "Can't create vector function.");
4177 SmallVector<OperandBundleDef, 1> OpBundles;
4178 CI->getOperandBundlesAsDefs(OpBundles);
4179 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4181 if (isa<FPMathOperator>(V))
4182 V->copyFastMathFlags(CI);
4184 VectorLoopValueMap.setVectorValue(&I, Part, V);
4192 // This instruction is not vectorized by simple widening.
4193 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4194 llvm_unreachable("Unhandled instruction!");
4198 void InnerLoopVectorizer::updateAnalysis() {
4199 // Forget the original basic block.
4200 PSE.getSE()->forgetLoop(OrigLoop);
4202 // DT is not kept up-to-date for outer loop vectorization
4203 if (EnableVPlanNativePath)
4206 // Update the dominator tree information.
4207 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4208 "Entry does not dominate exit.");
4210 DT->addNewBlock(LoopMiddleBlock,
4211 LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4212 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4213 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4214 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4215 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4218 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4219 // We should not collect Scalars more than once per VF. Right now, this
4220 // function is called from collectUniformsAndScalars(), which already does
4221 // this check. Collecting Scalars for VF=1 does not make any sense.
4222 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4223 "This function should not be visited twice for the same VF");
4225 SmallSetVector<Instruction *, 8> Worklist;
4227 // These sets are used to seed the analysis with pointers used by memory
4228 // accesses that will remain scalar.
4229 SmallSetVector<Instruction *, 8> ScalarPtrs;
4230 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4232 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4233 // The pointer operands of loads and stores will be scalar as long as the
4234 // memory access is not a gather or scatter operation. The value operand of a
4235 // store will remain scalar if the store is scalarized.
4236 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4237 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4238 assert(WideningDecision != CM_Unknown &&
4239 "Widening decision should be ready at this moment");
4240 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4241 if (Ptr == Store->getValueOperand())
4242 return WideningDecision == CM_Scalarize;
4243 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4244 "Ptr is neither a value nor a pointer operand");
4245 return WideningDecision != CM_GatherScatter;
4246 };
4248 // A helper that returns true if the given value is a bitcast or
4249 // getelementptr instruction contained in the loop.
4250 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4251 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4252 isa<GetElementPtrInst>(V)) &&
4253 !TheLoop->isLoopInvariant(V);
4254 };
4256 // A helper that evaluates a memory access's use of a pointer. If the use
4257 // will be a scalar use, and the pointer is only used by memory accesses, we
4258 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4259 // PossibleNonScalarPtrs.
4260 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4261 // We only care about bitcast and getelementptr instructions contained in
4262 // the loop.
4263 if (!isLoopVaryingBitCastOrGEP(Ptr))
4264 return;
4266 // If the pointer has already been identified as scalar (e.g., if it was
4267 // also identified as uniform), there's nothing to do.
4268 auto *I = cast<Instruction>(Ptr);
4269 if (Worklist.count(I))
4270 return;
4272 // If the use of the pointer will be a scalar use, and all users of the
4273 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4274 // place the pointer in PossibleNonScalarPtrs.
4275 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4276 return isa<LoadInst>(U) || isa<StoreInst>(U);
4277 }))
4278 ScalarPtrs.insert(I);
4279 else
4280 PossibleNonScalarPtrs.insert(I);
4281 };
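// For illustration (hypothetical IR, not from the original source): in
//   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
//   %v = load i32, i32* %gep
// a getelementptr feeding a consecutive (widened) load is used only by
// memory accesses, so it lands in ScalarPtrs; if %gep were also used by,
// say, a ptrtoint, it would be placed in PossibleNonScalarPtrs instead.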
4283 // We seed the scalars analysis with three classes of instructions: (1)
4284 // instructions marked uniform-after-vectorization, (2) bitcast and
4285 // getelementptr instructions used by memory accesses requiring a scalar use,
4286 // and (3) pointer induction variables and their update instructions (we
4287 // currently only scalarize these).
4289 // (1) Add to the worklist all instructions that have been identified as
4290 // uniform-after-vectorization.
4291 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4293 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4294 // memory accesses requiring a scalar use. The pointer operands of loads and
4295 // stores will be scalar as long as the memory accesses is not a gather or
4296 // scatter operation. The value operand of a store will remain scalar if the
4297 // store is scalarized.
4298 for (auto *BB : TheLoop->blocks())
4299 for (auto &I : *BB) {
4300 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4301 evaluatePtrUse(Load, Load->getPointerOperand());
4302 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4303 evaluatePtrUse(Store, Store->getPointerOperand());
4304 evaluatePtrUse(Store, Store->getValueOperand());
4305 }
4306 }
4307 for (auto *I : ScalarPtrs)
4308 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4309 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4310 Worklist.insert(I);
4311 }
4313 // (3) Add to the worklist all pointer induction variables and their update
4314 // instructions.
4316 // TODO: Once we are able to vectorize pointer induction variables we should
4317 // no longer insert them into the worklist here.
4318 auto *Latch = TheLoop->getLoopLatch();
4319 for (auto &Induction : *Legal->getInductionVars()) {
4320 auto *Ind = Induction.first;
4321 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4322 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4323 continue;
4324 Worklist.insert(Ind);
4325 Worklist.insert(IndUpdate);
4326 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4327 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4328 << "\n");
4329 }
4331 // Insert the forced scalars.
4332 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4333 // induction variable when the PHI user is scalarized.
4334 auto ForcedScalar = ForcedScalars.find(VF);
4335 if (ForcedScalar != ForcedScalars.end())
4336 for (auto *I : ForcedScalar->second)
4337 Worklist.insert(I);
4339 // Expand the worklist by looking through any bitcasts and getelementptr
4340 // instructions we've already identified as scalar. This is similar to the
4341 // expansion step in collectLoopUniforms(); however, here we're only
4342 // expanding to include additional bitcasts and getelementptr instructions.
4343 unsigned Idx = 0;
4344 while (Idx != Worklist.size()) {
4345 Instruction *Dst = Worklist[Idx++];
4346 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4347 continue;
4348 auto *Src = cast<Instruction>(Dst->getOperand(0));
4349 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4350 auto *J = cast<Instruction>(U);
4351 return !TheLoop->contains(J) || Worklist.count(J) ||
4352 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4353 isScalarUse(J, Src));
4354 })) {
4355 Worklist.insert(Src);
4356 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4357 }
4358 }
4360 // An induction variable will remain scalar if all users of the induction
4361 // variable and induction variable update remain scalar.
4362 for (auto &Induction : *Legal->getInductionVars()) {
4363 auto *Ind = Induction.first;
4364 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4366 // We already considered pointer induction variables, so there's no reason
4367 // to look at their users again.
4369 // TODO: Once we are able to vectorize pointer induction variables we
4370 // should no longer skip over them here.
4371 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4374 // Determine if all users of the induction variable are scalar after
4375 // vectorization.
4376 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4377 auto *I = cast<Instruction>(U);
4378 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4379 });
4381 if (!ScalarInd)
4382 continue;
4383 // Determine if all users of the induction variable update instruction are
4384 // scalar after vectorization.
4385 auto ScalarIndUpdate =
4386 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4387 auto *I = cast<Instruction>(U);
4388 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4389 });
4390 if (!ScalarIndUpdate)
4391 continue;
4393 // The induction variable and its update instruction will remain scalar.
4394 Worklist.insert(Ind);
4395 Worklist.insert(IndUpdate);
4396 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4397 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4398 << "\n");
4399 }
4401 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4402 }
4404 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4405 if (!blockNeedsPredication(I->getParent()))
4406 return false;
4407 switch(I->getOpcode()) {
4408 default:
4409 break;
4410 case Instruction::Load:
4411 case Instruction::Store: {
4412 if (!Legal->isMaskRequired(I))
4413 return false;
4414 auto *Ptr = getLoadStorePointerOperand(I);
4415 auto *Ty = getMemInstValueType(I);
4416 // We have already decided how to vectorize this instruction, get that
4417 // result.
4418 if (VF > 1) {
4419 InstWidening WideningDecision = getWideningDecision(I, VF);
4420 assert(WideningDecision != CM_Unknown &&
4421 "Widening decision should be ready at this moment");
4422 return WideningDecision == CM_Scalarize;
4423 }
4424 return isa<LoadInst>(I) ?
4425 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4426 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4427 }
4428 case Instruction::UDiv:
4429 case Instruction::SDiv:
4430 case Instruction::SRem:
4431 case Instruction::URem:
4432 return mayDivideByZero(*I);
4433 }
4434 return false;
4435 }
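// For illustration (hypothetical C loop, not from the original source):
//   for (i = 0; i < n; ++i)
//     if (c[i]) out[i] = a[i] / b[i];
// After if-conversion the division would execute for all lanes, and b[i] may
// be zero in lanes where c[i] is false; the UDiv/SDiv/SRem/URem cases above
// therefore keep such instructions scalar and predicated.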
4437 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4438 unsigned VF) {
4439 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4440 assert(getWideningDecision(I, VF) == CM_Unknown &&
4441 "Decision should not be set yet.");
4442 auto *Group = getInterleavedAccessGroup(I);
4443 assert(Group && "Must have a group.");
4445 // If the instruction's allocated size doesn't equal its type size, it
4446 // requires padding and will be scalarized.
4447 auto &DL = I->getModule()->getDataLayout();
4448 auto *ScalarTy = getMemInstValueType(I);
4449 if (hasIrregularType(ScalarTy, DL, VF))
4450 return false;
4452 // Check if masking is required.
4453 // A Group may need masking for one of two reasons: it resides in a block that
4454 // needs predication, or it was decided to use masking to deal with gaps.
4455 bool PredicatedAccessRequiresMasking =
4456 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4457 bool AccessWithGapsRequiresMasking =
4458 Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4459 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4460 return true;
4462 // If masked interleaving is required, we expect that the user/target had
4463 // enabled it, because otherwise it either wouldn't have been created or
4464 // it should have been invalidated by the CostModel.
4465 assert(useMaskedInterleavedAccesses(TTI) &&
4466 "Masked interleave-groups for predicated accesses are not enabled.");
4468 auto *Ty = getMemInstValueType(I);
4469 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4470 : TTI.isLegalMaskedStore(Ty);
4471 }
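// For illustration (hypothetical C loop; struct layout assumed): reading only
// member .x of a two-member struct,
//   for (i = 0; i < n; ++i)
//     sum += s[i].x;
// forms an interleave group of factor 2 with a gap. The wide load for the
// final iterations may touch unaccessed trailing elements, so the group
// either keeps a scalar epilogue or, when that is disallowed, requires the
// masking support checked above.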
4473 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4474 unsigned VF) {
4475 // Get and ensure we have a valid memory instruction.
4476 LoadInst *LI = dyn_cast<LoadInst>(I);
4477 StoreInst *SI = dyn_cast<StoreInst>(I);
4478 assert((LI || SI) && "Invalid memory instruction");
4480 auto *Ptr = getLoadStorePointerOperand(I);
4482 // In order to be widened, the pointer should be consecutive, first of all.
4483 if (!Legal->isConsecutivePtr(Ptr))
4484 return false;
4486 // If the instruction is a store located in a predicated block, it will be
4487 // scalarized.
4488 if (isScalarWithPredication(I))
4489 return false;
4491 // If the instruction's allocated size doesn't equal its type size, it
4492 // requires padding and will be scalarized.
4493 auto &DL = I->getModule()->getDataLayout();
4494 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4495 if (hasIrregularType(ScalarTy, DL, VF))
4496 return false;
4498 return true;
4499 }
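// For illustration of an irregular type (illustrative, not from the original
// source): on a typical x86-64 data layout an x86_fp80 value has a type size
// of 80 bits but an allocated size of 128 bits, so consecutive scalar
// accesses are separated by padding in memory and a plain wide load would
// read the padding; hasIrregularType rejects widening such accesses.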
4501 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4502 // We should not collect Uniforms more than once per VF. Right now,
4503 // this function is called from collectUniformsAndScalars(), which
4504 // already does this check. Collecting Uniforms for VF=1 does not make any
4505 // sense.
4507 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4508 "This function should not be visited twice for the same VF");
4510 // Visit the list of Uniforms. If we find no uniform value, we will not
4511 // analyze it again. Uniforms.count(VF) will still return 1.
4512 Uniforms[VF].clear();
4514 // We now know that the loop is vectorizable!
4515 // Collect instructions inside the loop that will remain uniform after
4516 // vectorization.
4518 // Global values, params and instructions outside of current loop are out of
4519 // scope.
4520 auto isOutOfScope = [&](Value *V) -> bool {
4521 Instruction *I = dyn_cast<Instruction>(V);
4522 return (!I || !TheLoop->contains(I));
4523 };
4525 SetVector<Instruction *> Worklist;
4526 BasicBlock *Latch = TheLoop->getLoopLatch();
4528 // Start with the conditional branch. If the branch condition is an
4529 // instruction contained in the loop that is only used by the branch, it is
4530 // uniform.
4531 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4532 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4533 Worklist.insert(Cmp);
4534 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4535 }
4537 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4538 // are pointers that are treated like consecutive pointers during
4539 // vectorization. The pointer operands of interleaved accesses are an
4540 // example.
4541 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4543 // Holds pointer operands of instructions that are possibly non-uniform.
4544 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4546 auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4547 InstWidening WideningDecision = getWideningDecision(I, VF);
4548 assert(WideningDecision != CM_Unknown &&
4549 "Widening decision should be ready at this moment");
4551 return (WideningDecision == CM_Widen ||
4552 WideningDecision == CM_Widen_Reverse ||
4553 WideningDecision == CM_Interleave);
4554 };
4555 // Iterate over the instructions in the loop, and collect all
4556 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4557 // that a consecutive-like pointer operand will be scalarized, we collect it
4558 // in PossibleNonUniformPtrs instead. We use two sets here because a single
4559 // getelementptr instruction can be used by both vectorized and scalarized
4560 // memory instructions. For example, if a loop loads and stores from the same
4561 // location, but the store is conditional, the store will be scalarized, and
4562 // the getelementptr won't remain uniform.
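// For illustration (hypothetical IR, not from the original source):
//   %gep = getelementptr inbounds i32, i32* %p, i64 %iv
//   %x = load i32, i32* %gep            ; widened, consecutive-like use
//   store i32 %y, i32* %gep             ; predicated block -> scalarized use
// The same getelementptr then has one widened user and one scalarized user,
// so it is recorded in both ConsecutiveLikePtrs and PossibleNonUniformPtrs
// and ends up treated as non-uniform.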
4563 for (auto *BB : TheLoop->blocks())
4564 for (auto &I : *BB) {
4565 // If there's no pointer operand, there's nothing to do.
4566 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4567 if (!Ptr)
4568 continue;
4570 // True if all users of Ptr are memory accesses that have Ptr as their
4571 // pointer operand.
4572 auto UsersAreMemAccesses =
4573 llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4574 return getLoadStorePointerOperand(U) == Ptr;
4575 });
4577 // Ensure the memory instruction will not be scalarized or used by
4578 // gather/scatter, making its pointer operand non-uniform. If the pointer
4579 // operand is used by any instruction other than a memory access, we
4580 // conservatively assume the pointer operand may be non-uniform.
4581 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4582 PossibleNonUniformPtrs.insert(Ptr);
4584 // If the memory instruction will be vectorized and its pointer operand
4585 // is consecutive-like, or interleaving - the pointer operand should
4586 // remain uniform.
4587 else
4588 ConsecutiveLikePtrs.insert(Ptr);
4589 }
4591 // Add to the Worklist all consecutive and consecutive-like pointers that
4592 // aren't also identified as possibly non-uniform.
4593 for (auto *V : ConsecutiveLikePtrs)
4594 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4595 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4596 Worklist.insert(V);
4597 }
4599 // Expand Worklist in topological order: whenever a new instruction
4600 // is added, its users should already be inside the Worklist. This ensures
4601 // a uniform instruction will only be used by uniform instructions.
4602 unsigned idx = 0;
4603 while (idx != Worklist.size()) {
4604 Instruction *I = Worklist[idx++];
4606 for (auto OV : I->operand_values()) {
4607 // isOutOfScope operands cannot be uniform instructions.
4608 if (isOutOfScope(OV))
4609 continue;
4610 // First order recurrence Phi's should typically be considered
4611 // non-uniform.
4612 auto *OP = dyn_cast<PHINode>(OV);
4613 if (OP && Legal->isFirstOrderRecurrence(OP))
4614 continue;
4615 // If all the users of the operand are uniform, then add the
4616 // operand into the uniform worklist.
4617 auto *OI = cast<Instruction>(OV);
4618 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4619 auto *J = cast<Instruction>(U);
4620 return Worklist.count(J) ||
4621 (OI == getLoadStorePointerOperand(J) &&
4622 isUniformDecision(J, VF));
4623 })) {
4624 Worklist.insert(OI);
4625 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4626 }
4627 }
4628 }
4630 // Returns true if Ptr is the pointer operand of a memory access instruction
4631 // I, and I is known to not require scalarization.
4632 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4633 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4634 };
4636 // For an instruction to be added into Worklist above, all its users inside
4637 // the loop should also be in Worklist. However, this condition cannot be
4638 // true for phi nodes that form a cyclic dependence. We must process phi
4639 // nodes separately. An induction variable will remain uniform if all users
4640 // of the induction variable and induction variable update remain uniform.
4641 // The code below handles both pointer and non-pointer induction variables.
4642 for (auto &Induction : *Legal->getInductionVars()) {
4643 auto *Ind = Induction.first;
4644 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4646 // Determine if all users of the induction variable are uniform after
4647 // vectorization.
4648 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4649 auto *I = cast<Instruction>(U);
4650 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4651 isVectorizedMemAccessUse(I, Ind);
4652 });
4654 if (!UniformInd)
4655 continue;
4656 // Determine if all users of the induction variable update instruction are
4657 // uniform after vectorization.
4658 auto UniformIndUpdate =
4659 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4660 auto *I = cast<Instruction>(U);
4661 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4662 isVectorizedMemAccessUse(I, IndUpdate);
4663 });
4664 if (!UniformIndUpdate)
4665 continue;
4667 // The induction variable and its update instruction will remain uniform.
4668 Worklist.insert(Ind);
4669 Worklist.insert(IndUpdate);
4670 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4671 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4672 << "\n");
4673 }
4675 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4676 }
4678 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4679 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4680 // TODO: It may be useful to do since it's still likely to be dynamically
4681 // uniform if the target can skip.
4682 LLVM_DEBUG(
4683 dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4685 ORE->emit(
4686 createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4687 << "runtime pointer checks needed. Not enabled for divergent target");
4689 return None;
4690 }
4692 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4693 if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4694 return computeFeasibleMaxVF(OptForSize, TC);
4696 if (Legal->getRuntimePointerChecking()->Need) {
4697 ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4698 << "runtime pointer checks needed. Enable vectorization of this "
4699 "loop with '#pragma clang loop vectorize(enable)' when "
4700 "compiling with -Os/-Oz");
4703 << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4707 if (!PSE.getUnionPredicate().getPredicates().empty()) {
4708 ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4709 << "runtime SCEV checks needed. Enable vectorization of this "
4710 "loop with '#pragma clang loop vectorize(enable)' when "
4711 "compiling with -Os/-Oz");
4714 << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4718 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4719 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4720 ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4721 << "runtime stride == 1 checks needed. Enable vectorization of "
4722 "this loop with '#pragma clang loop vectorize(enable)' when "
4723 "compiling with -Os/-Oz");
4726 << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4730 // If we optimize the program for size, avoid creating the tail loop.
4731 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4733 if (TC == 1) {
4734 ORE->emit(createMissedAnalysis("SingleIterationLoop")
4735 << "loop trip count is one, irrelevant for vectorization");
4736 LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4737 return None;
4738 }
4740 // Record that scalar epilogue is not allowed.
4741 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4743 IsScalarEpilogueAllowed = !OptForSize;
4745 // We don't create an epilogue when optimizing for size.
4746 // Invalidate interleave groups that require an epilogue if we can't mask
4747 // the interleave-group.
4748 if (!useMaskedInterleavedAccesses(TTI))
4749 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4751 unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4753 if (TC > 0 && TC % MaxVF == 0) {
4754 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4755 return MaxVF;
4756 }
4758 // If we don't know the precise trip count, or if the trip count that we
4759 // found modulo the vectorization factor is not zero, try to fold the tail
4760 // by masking.
4761 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4762 if (Legal->canFoldTailByMasking()) {
4763 FoldTailByMasking = true;
4764 return MaxVF;
4765 }
4767 if (TC == 0) {
4768 ORE->emit(
4769 createMissedAnalysis("UnknownLoopCountComplexCFG")
4770 << "unable to calculate the loop count due to complex control flow");
4771 return None;
4772 }
4774 ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4775 << "cannot optimize for size and vectorize at the same time. "
4776 "Enable vectorization of this loop with '#pragma clang loop "
4777 "vectorize(enable)' when compiling with -Os/-Oz");
4778 return None;
4779 }
4781 unsigned
4782 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4783 unsigned ConstTripCount) {
4784 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4785 unsigned SmallestType, WidestType;
4786 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4787 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4789 // Get the maximum safe dependence distance in bits computed by LAA.
4790 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4791 // the memory accesses that is most restrictive (involved in the smallest
4792 // dependence distance).
4793 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4795 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4797 unsigned MaxVectorSize = WidestRegister / WidestType;
4799 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4800 << " / " << WidestType << " bits.\n");
4801 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4802 << WidestRegister << " bits.\n");
4804 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4805 " into one vector!");
4806 if (MaxVectorSize == 0) {
4807 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4808 MaxVectorSize = 1;
4809 return MaxVectorSize;
4810 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4811 isPowerOf2_32(ConstTripCount)) {
4812 // We need to clamp the VF to be the ConstTripCount. There is no point in
4813 // choosing a higher viable VF as done in the loop below.
4814 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4815 << ConstTripCount << "\n");
4816 MaxVectorSize = ConstTripCount;
4817 return MaxVectorSize;
4818 }
4820 unsigned MaxVF = MaxVectorSize;
4821 if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4822 (MaximizeBandwidth && !OptForSize)) {
4823 // Collect all viable vectorization factors larger than the default MaxVF
4824 // (i.e. MaxVectorSize).
4825 SmallVector<unsigned, 8> VFs;
4826 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4827 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4828 VFs.push_back(VS);
4830 // For each VF calculate its register usage.
4831 auto RUs = calculateRegisterUsage(VFs);
4833 // Select the largest VF which doesn't require more registers than existing
4834 // ones.
4835 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4836 for (int i = RUs.size() - 1; i >= 0; --i) {
4837 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4838 MaxVF = VFs[i];
4839 break;
4840 }
4841 }
4842 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4843 if (MaxVF < MinVF) {
4844 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4845 << ") with target's minimum: " << MinVF << '\n');
4854 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4855 float Cost = expectedCost(1).first;
4856 const float ScalarCost = Cost;
4857 unsigned Width = 1;
4858 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4860 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4861 if (ForceVectorization && MaxVF > 1) {
4862 // Ignore scalar width, because the user explicitly wants vectorization.
4863 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4864 // analysis.
4865 Cost = std::numeric_limits<float>::max();
4866 }
4868 for (unsigned i = 2; i <= MaxVF; i *= 2) {
4869 // Notice that the vector loop needs to be executed less times, so
4870 // we need to divide the cost of the vector loops by the width of
4871 // the vector elements.
4872 VectorizationCostTy C = expectedCost(i);
4873 float VectorCost = C.first / (float)i;
4874 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4875 << " costs: " << (int)VectorCost << ".\n");
4876 if (!C.second && !ForceVectorization) {
4878 dbgs() << "LV: Not considering vector loop of width " << i
4879 << " because it will not generate any vector instructions.\n");
4882 if (VectorCost < Cost) {
4888 if (!EnableCondStoresVectorization && NumPredStores) {
4889 ORE->emit(createMissedAnalysis("ConditionalStore")
4890 << "store that is conditionally executed prevents vectorization");
4891 LLVM_DEBUG(
4892 dbgs() << "LV: No vectorization. There are conditional stores.\n");
4893 Width = 1;
4894 Cost = ScalarCost;
4895 }
4897 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4898 << "LV: Vectorization seems to be not beneficial, "
4899 << "but was forced by a user.\n");
4900 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4901 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4902 return Factor;
4903 }
4905 std::pair<unsigned, unsigned>
4906 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4907 unsigned MinWidth = -1U;
4908 unsigned MaxWidth = 8;
4909 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4911 // For each block.
4912 for (BasicBlock *BB : TheLoop->blocks()) {
4913 // For each instruction in the loop.
4914 for (Instruction &I : BB->instructionsWithoutDebug()) {
4915 Type *T = I.getType();
4917 // Skip ignored values.
4918 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4919 continue;
4921 // Only examine Loads, Stores and PHINodes.
4922 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4923 continue;
4925 // Examine PHI nodes that are reduction variables. Update the type to
4926 // account for the recurrence type.
4927 if (auto *PN = dyn_cast<PHINode>(&I)) {
4928 if (!Legal->isReductionVariable(PN))
4929 continue;
4930 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4931 T = RdxDesc.getRecurrenceType();
4932 }
4934 // Examine the stored values.
4935 if (auto *ST = dyn_cast<StoreInst>(&I))
4936 T = ST->getValueOperand()->getType();
4938 // Ignore loaded pointer types and stored pointer types that are not
4939 // vectorizable.
4941 // FIXME: The check here attempts to predict whether a load or store will
4942 // be vectorized. We only know this for certain after a VF has
4943 // been selected. Here, we assume that if an access can be
4944 // vectorized, it will be. We should also look at extending this
4945 // optimization to non-pointer types.
4947 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4948 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4949 continue;
4951 MinWidth = std::min(MinWidth,
4952 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4953 MaxWidth = std::max(MaxWidth,
4954 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4955 }
4956 }
4958 return {MinWidth, MaxWidth};
4959 }
4961 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4962 unsigned VF,
4963 unsigned LoopCost) {
4964 // -- The interleave heuristics --
4965 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4966 // There are many micro-architectural considerations that we can't predict
4967 // at this level. For example, frontend pressure (on decode or fetch) due to
4968 // code size, or the number and capabilities of the execution ports.
4970 // We use the following heuristics to select the interleave count:
4971 // 1. If the code has reductions, then we interleave to break the cross
4972 // iteration dependency.
4973 // 2. If the loop is really small, then we interleave to reduce the loop
4974 // overhead.
4975 // 3. We don't interleave if we think that we will spill registers to memory
4976 // due to the increased register pressure.
4978 // When we optimize for size, we don't interleave.
4979 if (OptForSize)
4980 return 1;
4982 // We used the distance for the interleave count.
4983 if (Legal->getMaxSafeDepDistBytes() != -1U)
4984 return 1;
4986 // Do not interleave loops with a relatively small trip count.
4987 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4988 if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4989 return 1;
4991 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4992 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4993 << " registers\n");
4995 if (VF == 1) {
4996 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4997 TargetNumRegisters = ForceTargetNumScalarRegs;
4998 } else {
4999 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5000 TargetNumRegisters = ForceTargetNumVectorRegs;
5001 }
5003 RegisterUsage R = calculateRegisterUsage({VF})[0];
5004 // We divide by these constants so assume that we have at least one
5005 // instruction that uses at least one register.
5006 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5008 // We calculate the interleave count using the following formula.
5009 // Subtract the number of loop invariants from the number of available
5010 // registers. These registers are used by all of the interleaved instances.
5011 // Next, divide the remaining registers by the number of registers that is
5012 // required by the loop, in order to estimate how many parallel instances
5013 // fit without causing spills. All of this is rounded down if necessary to be
5014 // a power of two. We want power of two interleave count to simplify any
5015 // addressing operations or alignment considerations.
5016 // We also want power of two interleave counts to ensure that the induction
5017 // variable of the vector loop wraps to zero, when tail is folded by masking;
5018 // this currently happens when OptForSize, in which case IC is set to 1 above.
5019 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5020 R.MaxLocalUsers);
5022 // Don't count the induction variable as interleaved.
5023 if (EnableIndVarRegisterHeur)
5024 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5025 std::max(1U, (R.MaxLocalUsers - 1)));
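// A worked example (illustrative numbers, not from the original source):
// with 16 target registers, 2 loop-invariant values, and a max local usage
// of 4, the induction-variable heuristic gives
//   IC = PowerOf2Floor((16 - 2 - 1) / (4 - 1)) = PowerOf2Floor(4) = 4
// interleaved instances before clamping.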
5027 // Clamp the interleave ranges to reasonable counts.
5028 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5030 // Check if the user has overridden the max.
5031 if (VF == 1) {
5032 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5033 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5034 } else {
5035 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5036 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5037 }
5039 // If we did not calculate the cost for VF (because the user selected the VF)
5040 // then we calculate the cost of VF here.
5041 if (!LoopCost)
5042 LoopCost = expectedCost(VF).first;
5044 assert(LoopCost && "Non-zero loop cost expected");
5046 // Clamp the calculated IC to be between the 1 and the max interleave count
5047 // that the target allows.
5048 if (IC > MaxInterleaveCount)
5049 IC = MaxInterleaveCount;
5050 else if (IC < 1)
5051 IC = 1;
5053 // Interleave if we vectorized this loop and there is a reduction that could
5054 // benefit from interleaving.
5055 if (VF > 1 && !Legal->getReductionVars()->empty()) {
5056 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5057 return IC;
5058 }
5060 // Note that if we've already vectorized the loop we will have done the
5061 // runtime check and so interleaving won't require further checks.
5062 bool InterleavingRequiresRuntimePointerCheck =
5063 (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5065 // We want to interleave small loops in order to reduce the loop overhead and
5066 // potentially expose ILP opportunities.
5067 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5068 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5069 // We assume that the cost overhead is 1 and we use the cost model
5070 // to estimate the cost of the loop and interleave until the cost of the
5071 // loop overhead is about 5% of the cost of the loop.
5072 unsigned SmallIC =
5073 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5075 // Interleave until store/load ports (estimated by max interleave count) are
5076 // saturated.
5077 unsigned NumStores = Legal->getNumStores();
5078 unsigned NumLoads = Legal->getNumLoads();
5079 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5080 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
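// A worked example (illustrative numbers): with IC = 8, 2 stores, and 1 load
// in the loop body, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8, so the
// port-saturation heuristic below would suggest interleaving by 8 when that
// exceeds SmallIC.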
5082 // If we have a scalar reduction (vector reductions are already dealt with
5083 // by this point), we can increase the critical path length if the loop
5084 // we're interleaving is inside another loop. Limit, by default to 2, so the
5085 // critical path only gets increased by one reduction operation.
5086 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5087 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5088 SmallIC = std::min(SmallIC, F);
5089 StoresIC = std::min(StoresIC, F);
5090 LoadsIC = std::min(LoadsIC, F);
5091 }
5093 if (EnableLoadStoreRuntimeInterleave &&
5094 std::max(StoresIC, LoadsIC) > SmallIC) {
5095 LLVM_DEBUG(
5096 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5097 return std::max(StoresIC, LoadsIC);
5098 }
5100 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5101 return SmallIC;
5102 }
5104 // Interleave if this is a large loop (small loops are already dealt with by
5105 // this point) that could benefit from interleaving.
5106 bool HasReductions = !Legal->getReductionVars()->empty();
5107 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5108 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5109 return IC;
5110 }
5112 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5113 return 1;
5114 }
5116 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5117 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5118 // This function calculates the register usage by measuring the highest number
5119 // of values that are alive at a single location. Obviously, this is a very
5120 // rough estimation. We scan the loop in topological order and
5121 // assign a number to each instruction. We use RPO to ensure that defs are
5122 // met before their users. We assume that each instruction that has in-loop
5123 // users starts an interval. We record every time that an in-loop value is
5124 // used, so we have a list of the first and last occurrences of each
5125 // instruction. Next, we transpose this data structure into a multi map that
5126 // holds the list of intervals that *end* at a specific location. This multi
5127 // map allows us to perform a linear search. We scan the instructions linearly
5128 // and record each time that a new interval starts, by placing it in a set.
5129 // If we find this value in the multi-map then we remove it from the set.
5130 // The max register usage is the maximum size of the set.
5131 // We also search for instructions that are defined outside the loop, but are
5132 // used inside the loop. We need this number separately from the max-interval
5133 // usage number because when we unroll, loop-invariant values do not take
5134 // registers.
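// For illustration of the interval scan (hypothetical indices, not from the
// original source): if %a is defined at index 0 and last used at index 2,
// and %b is defined at index 1 and last used at index 3, then at index 2
// both intervals are still open and the running maximum records two
// simultaneously live values; a value defined outside the loop but used
// inside it is instead counted once as a loop-invariant register.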
5135 LoopBlocksDFS DFS(TheLoop);
5136 DFS.perform(LI);
5138 RegisterUsage RU;
5140 // Each 'key' in the map opens a new interval. The values
5141 // of the map are the index of the 'last seen' usage of the
5142 // instruction that is the key.
5143 using IntervalMap = DenseMap<Instruction *, unsigned>;
5145 // Maps instruction to its index.
5146 SmallVector<Instruction *, 64> IdxToInstr;
5147 // Marks the end of each interval.
5148 IntervalMap EndPoint;
5149 // Saves the list of instruction indices that are used in the loop.
5150 SmallPtrSet<Instruction *, 8> Ends;
5151 // Saves the list of values that are used in the loop but are
5152 // defined outside the loop, such as arguments and constants.
5153 SmallPtrSet<Value *, 8> LoopInvariants;
5155 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5156 for (Instruction &I : BB->instructionsWithoutDebug()) {
5157 IdxToInstr.push_back(&I);
5159 // Save the end location of each USE.
5160 for (Value *U : I.operands()) {
5161 auto *Instr = dyn_cast<Instruction>(U);
5163 // Ignore non-instruction values such as arguments, constants, etc.
5164 if (!Instr)
5165 continue;
5167 // If this instruction is outside the loop then record it and continue.
5168 if (!TheLoop->contains(Instr)) {
5169 LoopInvariants.insert(Instr);
5170 continue;
5171 }
5173 // Overwrite previous end points.
5174 EndPoint[Instr] = IdxToInstr.size();
5175 Ends.insert(Instr);
5176 }
5177 }
5178 }
5180 // Saves the list of intervals that end with the index in 'key'.
5181 using InstrList = SmallVector<Instruction *, 2>;
5182 DenseMap<unsigned, InstrList> TransposeEnds;
5184 // Transpose the EndPoints to a list of values that end at each index.
5185 for (auto &Interval : EndPoint)
5186 TransposeEnds[Interval.second].push_back(Interval.first);
5188 SmallPtrSet<Instruction *, 8> OpenIntervals;
5190 // Get the size of the widest register.
5191 unsigned MaxSafeDepDist = -1U;
5192 if (Legal->getMaxSafeDepDistBytes() != -1U)
5193 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5194 unsigned WidestRegister =
5195 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5196 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5198 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5199 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5201 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5203 // A lambda that gets the register usage for the given type and VF.
5204 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5205 if (Ty->isTokenTy())
5206 return 0U;
5207 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5208 return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5209 };
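// A worked example for GetRegUsage (illustrative numbers): an i64 value at
// VF = 4 with a 128-bit widest register takes max(1, 4 * 64 / 128) = 2
// registers, while an i32 at VF = 2 still costs max(1, 2 * 32 / 128) = 1.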
5211 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5212 Instruction *I = IdxToInstr[i];
5214 // Remove all of the instructions that end at this location.
5215 InstrList &List = TransposeEnds[i];
5216 for (Instruction *ToRemove : List)
5217 OpenIntervals.erase(ToRemove);
5219 // Ignore instructions that are never used within the loop.
5220 if (Ends.find(I) == Ends.end())
5221 continue;
5223 // Skip ignored values.
5224 if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5225 continue;
5227 // For each VF find the maximum usage of registers.
5228 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5229 if (VFs[j] == 1) {
5230 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5231 continue;
5232 }
5233 collectUniformsAndScalars(VFs[j]);
5234 // Count the number of live intervals.
5235 unsigned RegUsage = 0;
5236 for (auto Inst : OpenIntervals) {
5237 // Skip ignored values for VF > 1.
5238 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5239 isScalarAfterVectorization(Inst, VFs[j]))
5240 continue;
5241 RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5242 }
5243 MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5244 }
5246 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5247 << OpenIntervals.size() << '\n');
5249 // Add the current instruction to the list of open intervals.
5250 OpenIntervals.insert(I);
5251 }
5253 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5254 unsigned Invariant = 0;
5255 if (VFs[i] == 1)
5256 Invariant = LoopInvariants.size();
5257 else {
5258 for (auto Inst : LoopInvariants)
5259 Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5260 }
5262 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5263 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5264 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5265 << '\n');
5267 RU.LoopInvariantRegs = Invariant;
5268 RU.MaxLocalUsers = MaxUsages[i];
5269 RUs[i] = RU;
5270 }
5272 return RUs;
5273 }
5275 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5276 // TODO: Cost model for emulated masked load/store is completely
5277 // broken. This hack guides the cost model to use an artificially
5278 // high enough value to practically disable vectorization with such
5279 // operations, except where previously deployed legality hack allowed
5280 // using very low cost values. This is to avoid regressions coming simply
5281 // from moving "masked load/store" check from legality to cost model.
5282 // Masked Load/Gather emulation was previously never allowed.
5283 // Limited number of Masked Store/Scatter emulation was allowed.
5284 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5285 return isa<LoadInst>(I) ||
5286 (isa<StoreInst>(I) &&
5287 NumPredStores > NumberOfStoresToPredicate;
5288 }
5290 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5291 // If we aren't vectorizing the loop, or if we've already collected the
5292 // instructions to scalarize, there's nothing to do. Collection may already
5293 // have occurred if we have a user-selected VF and are now computing the
5294 // expected cost for interleaving.
5295 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5296 return;
5298 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5299 // not profitable to scalarize any instructions, the presence of VF in the
5300 // map will indicate that we've analyzed it already.
5301 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5303 // Find all the instructions that are scalar with predication in the loop and
5304 // determine if it would be better to not if-convert the blocks they are in.
5305 // If so, we also record the instructions to scalarize.
5306 for (BasicBlock *BB : TheLoop->blocks()) {
5307 if (!blockNeedsPredication(BB))
5308 continue;
5309 for (Instruction &I : *BB)
5310 if (isScalarWithPredication(&I)) {
5311 ScalarCostsTy ScalarCosts;
5312 // Do not apply discount logic if hacked cost is needed
5313 // for emulated masked memrefs.
5314 if (!useEmulatedMaskMemRefHack(&I) &&
5315 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5316 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5317 // Remember that BB will remain after vectorization.
5318 PredicatedBBsAfterVectorization.insert(BB);
5319 }
5320 }
5321 }
5323 int LoopVectorizationCostModel::computePredInstDiscount(
5324 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5325 unsigned VF) {
5326 assert(!isUniformAfterVectorization(PredInst, VF) &&
5327 "Instruction marked uniform-after-vectorization will be predicated");
5329 // Initialize the discount to zero, meaning that the scalar version and the
5330 // vector version cost the same.
5331 int Discount = 0;
5333 // Holds instructions to analyze. The instructions we visit are mapped in
5334 // ScalarCosts. Those instructions are the ones that would be scalarized if
5335 // we find that the scalar version costs less.
5336 SmallVector<Instruction *, 8> Worklist;
5338 // Returns true if the given instruction can be scalarized.
5339 auto canBeScalarized = [&](Instruction *I) -> bool {
5340 // We only attempt to scalarize instructions forming a single-use chain
5341 // from the original predicated block that would otherwise be vectorized.
5342 // Although not strictly necessary, we give up on instructions we know will
5343 // already be scalar to avoid traversing chains that are unlikely to be
5344 // scalarized.
5345 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5346 isScalarAfterVectorization(I, VF))
5347 return false;
5349 // If the instruction is scalar with predication, it will be analyzed
5350 // separately. We ignore it within the context of PredInst.
5351 if (isScalarWithPredication(I))
5352 return false;
5354 // If any of the instruction's operands are uniform after vectorization,
5355 // the instruction cannot be scalarized. This prevents, for example, a
5356 // masked load from being scalarized.
5358 // We assume we will only emit a value for lane zero of an instruction
5359 // marked uniform after vectorization, rather than VF identical values.
5360 // Thus, if we scalarize an instruction that uses a uniform, we would
5361 // create uses of values corresponding to the lanes we aren't emitting code
5362 // for. This behavior can be changed by allowing getScalarValue to clone
5363 // the lane zero values for uniforms rather than asserting.
5364 for (Use &U : I->operands())
5365 if (auto *J = dyn_cast<Instruction>(U.get()))
5366 if (isUniformAfterVectorization(J, VF))
5367 return false;
5369 // Otherwise, we can scalarize the instruction.
5370 return true;
5371 };
5373 // Compute the expected cost discount from scalarizing the entire expression
5374 // feeding the predicated instruction. We currently only consider expressions
5375 // that are single-use instruction chains.
5376 Worklist.push_back(PredInst);
5377 while (!Worklist.empty()) {
5378 Instruction *I = Worklist.pop_back_val();
5380 // If we've already analyzed the instruction, there's nothing to do.
5381 if (ScalarCosts.find(I) != ScalarCosts.end())
5382 continue;
5384 // Compute the cost of the vector instruction. Note that this cost already
5385 // includes the scalarization overhead of the predicated instruction.
5386 unsigned VectorCost = getInstructionCost(I, VF).first;
5388 // Compute the cost of the scalarized instruction. This cost is the cost of
5389 // the instruction as if it wasn't if-converted and instead remained in the
5390 // predicated block. We will scale this cost by block probability after
5391 // computing the scalarization overhead.
5392 unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5394 // Compute the scalarization overhead of needed insertelement instructions
5395 // and phi nodes.
5396 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5397 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5398 true, false);
5399 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5400 }
5402 // Compute the scalarization overhead of needed extractelement
5403 // instructions. For each of the instruction's operands, if the operand can
5404 // be scalarized, add it to the worklist; otherwise, account for the
5405 // scalarization overhead.
5406 for (Use &U : I->operands())
5407 if (auto *J = dyn_cast<Instruction>(U.get())) {
5408 assert(VectorType::isValidElementType(J->getType()) &&
5409 "Instruction has non-scalar type");
5410 if (canBeScalarized(J))
5411 Worklist.push_back(J);
5412 else if (needsExtract(J, VF))
5413 ScalarCost += TTI.getScalarizationOverhead(
5414 ToVectorTy(J->getType(), VF), false, true);
5415 }
5417 // Scale the total scalar cost by block probability.
5418 ScalarCost /= getReciprocalPredBlockProb();
5420 // Compute the discount. A non-negative discount means the vector version
5421 // of the instruction costs more, and scalarizing would be beneficial.
5422 Discount += VectorCost - ScalarCost;
5423 ScalarCosts[I] = ScalarCost;
5424 }
5426 return Discount;
5427 }
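// A worked example (illustrative costs, not from the original source): if an
// instruction's vector cost is 8 and its per-lane scalar cost is 3 with
// VF = 2 in a block assumed to execute half the time, then
// ScalarCost = (2 * 3) / 2 = 3 after the probability scaling, and
// Discount += 8 - 3 = 5, i.e. scalarizing the chain is predicted to be
// cheaper than if-converting it.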
5429 LoopVectorizationCostModel::VectorizationCostTy
5430 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5431 VectorizationCostTy Cost;
5433 // For each block.
5434 for (BasicBlock *BB : TheLoop->blocks()) {
5435 VectorizationCostTy BlockCost;
5437 // For each instruction in the old loop.
5438 for (Instruction &I : BB->instructionsWithoutDebug()) {
5439 // Skip ignored values.
5440 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5441 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5442 continue;
5444 VectorizationCostTy C = getInstructionCost(&I, VF);
5446 // Check if we should override the cost.
5447 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5448 C.first = ForceTargetInstructionCost;
5450 BlockCost.first += C.first;
5451 BlockCost.second |= C.second;
5452 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5453 << " for VF " << VF << " For instruction: " << I
5457 // If we are vectorizing a predicated block, it will have been
5458 // if-converted. This means that the block's instructions (aside from
5459 // stores and instructions that may divide by zero) will now be
5460 // unconditionally executed. For the scalar case, we may not always execute
5461 // the predicated block. Thus, scale the block's cost by the probability of
5462 // executing it.
5463 if (VF == 1 && blockNeedsPredication(BB))
5464 BlockCost.first /= getReciprocalPredBlockProb();
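// A numeric example (illustrative): a predicated block with scalar cost 10
// and an assumed 50% execution probability contributes 10 / 2 = 5 to the
// scalar loop cost, matching the vector side where the if-converted block
// executes unconditionally.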
5466 Cost.first += BlockCost.first;
5467 Cost.second |= BlockCost.second;
5468 }
5470 return Cost;
5471 }
5473 /// Gets Address Access SCEV after verifying that the access pattern
5474 /// is loop invariant except the induction variable dependence.
5475 ///
5476 /// This SCEV can be sent to the Target in order to estimate the address
5477 /// calculation cost.
5478 static const SCEV *getAddressAccessSCEV(
5479 Value *Ptr,
5480 LoopVectorizationLegality *Legal,
5481 PredicatedScalarEvolution &PSE,
5482 const Loop *TheLoop) {
5484 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5485 if (!Gep)
5486 return nullptr;
5488 // We are looking for a gep with all loop invariant indices except for one
5489 // which should be an induction variable.
5490 auto SE = PSE.getSE();
5491 unsigned NumOperands = Gep->getNumOperands();
5492 for (unsigned i = 1; i < NumOperands; ++i) {
5493 Value *Opd = Gep->getOperand(i);
5494 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5495 !Legal->isInductionVariable(Opd))
5496 return nullptr;
5497 }
5499 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5500 return PSE.getSCEV(Ptr);
5501 }
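// For illustration (hypothetical IR, not from the original source): for
//   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
// with %base loop-invariant and %iv an induction variable, the helper above
// returns the SCEV of %gep so the target can price the address computation;
// if some index were neither invariant nor an induction, it returns nullptr.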
5503 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5504 return Legal->hasStride(I->getOperand(0)) ||
5505 Legal->hasStride(I->getOperand(1));
5506 }
5508 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5509 unsigned VF) {
5510 assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5511 Type *ValTy = getMemInstValueType(I);
5512 auto SE = PSE.getSE();
5514 unsigned Alignment = getLoadStoreAlignment(I);
5515 unsigned AS = getLoadStoreAddressSpace(I);
5516 Value *Ptr = getLoadStorePointerOperand(I);
5517 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5519 // Figure out whether the access is strided and get the stride value
5520 // if it's known at compile time.
5521 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5523 // Get the cost of the scalar memory instruction and address computation.
5524 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5526 // Don't pass *I here, since it is scalar but will actually be part of a
5527 // vectorized loop where the user of it is a vectorized instruction.
5528 Cost += VF *
5529 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5530 AS);
5532 // Get the overhead of the extractelement and insertelement instructions
5533 // we might create due to scalarization.
5534 Cost += getScalarizationOverhead(I, VF);
5536 // If we have a predicated store, it may not be executed for each vector
5537 // lane. Scale the cost by the probability of executing the predicated
5538 // block.
5539 if (isPredicatedInst(I)) {
5540 Cost /= getReciprocalPredBlockProb();
5542 if (useEmulatedMaskMemRefHack(I))
5543 // Artificially setting to a high enough value to practically disable
5544 // vectorization with such operations.
5545 Cost = 3000000;
5546 }
5548 return Cost;
5549 }
5551 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5552 unsigned VF) {
5553 Type *ValTy = getMemInstValueType(I);
5554 Type *VectorTy = ToVectorTy(ValTy, VF);
5555 unsigned Alignment = getLoadStoreAlignment(I);
5556 Value *Ptr = getLoadStorePointerOperand(I);
5557 unsigned AS = getLoadStoreAddressSpace(I);
5558 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5560 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5561 "Stride should be 1 or -1 for consecutive memory access");
5563 if (Legal->isMaskRequired(I))
5564 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5566 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5568 bool Reverse = ConsecutiveStride < 0;
5569 if (Reverse)
5570 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5571 return Cost;
5572 }
5574 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5575 unsigned VF) {
5576 Type *ValTy = getMemInstValueType(I);
5577 Type *VectorTy = ToVectorTy(ValTy, VF);
5578 unsigned Alignment = getLoadStoreAlignment(I);
5579 unsigned AS = getLoadStoreAddressSpace(I);
5580 if (isa<LoadInst>(I)) {
5581 return TTI.getAddressComputationCost(ValTy) +
5582 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5583 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5584 }
5585 StoreInst *SI = cast<StoreInst>(I);
5587 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5588 return TTI.getAddressComputationCost(ValTy) +
5589 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5590 (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5591 Instruction::ExtractElement,
5592 VectorTy, VF - 1));
5593 }
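// For illustration (hypothetical source): for a uniform-address load
//   x = *p;   // same address every iteration
// the cost above is one scalar load plus a broadcast; a store of a
// loop-invariant value needs no extract, while a store of a varying value
// pays one extractelement for the last lane.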
5595 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5596 unsigned VF) {
5597 Type *ValTy = getMemInstValueType(I);
5598 Type *VectorTy = ToVectorTy(ValTy, VF);
5599 unsigned Alignment = getLoadStoreAlignment(I);
5600 Value *Ptr = getLoadStorePointerOperand(I);
5602 return TTI.getAddressComputationCost(VectorTy) +
5603 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5604 Legal->isMaskRequired(I), Alignment);
5605 }
5607 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5608 unsigned VF) {
5609 Type *ValTy = getMemInstValueType(I);
5610 Type *VectorTy = ToVectorTy(ValTy, VF);
5611 unsigned AS = getLoadStoreAddressSpace(I);
5613 auto Group = getInterleavedAccessGroup(I);
5614 assert(Group && "Fail to get an interleaved access group.");
5616 unsigned InterleaveFactor = Group->getFactor();
5617 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5619 // Holds the indices of existing members in an interleaved load group.
5620 // An interleaved store group doesn't need this as it doesn't allow gaps.
5621 SmallVector<unsigned, 4> Indices;
5622 if (isa<LoadInst>(I)) {
5623 for (unsigned i = 0; i < InterleaveFactor; i++)
5624 if (Group->getMember(i))
5625 Indices.push_back(i);
5626 }
5628 // Calculate the cost of the whole interleaved group.
5629 bool UseMaskForGaps =
5630 Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
5631 unsigned Cost = TTI.getInterleavedMemoryOpCost(
5632 I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5633 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5635 if (Group->isReverse()) {
5636 // TODO: Add support for reversed masked interleaved access.
5637 assert(!Legal->isMaskRequired(I) &&
5638 "Reverse masked interleaved access not supported.");
5639 Cost += Group->getNumMembers() *
5640 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5641 }
5642 return Cost;
5643 }
5645 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5646 unsigned VF) {
5647 // Calculate scalar cost only. Vectorization cost should be ready at this
5648 // moment.
5649 if (VF == 1) {
5650 Type *ValTy = getMemInstValueType(I);
5651 unsigned Alignment = getLoadStoreAlignment(I);
5652 unsigned AS = getLoadStoreAddressSpace(I);
5654 return TTI.getAddressComputationCost(ValTy) +
5655 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5656 }
5657 return getWideningCost(I, VF);
5658 }
5660 LoopVectorizationCostModel::VectorizationCostTy
5661 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5662 // If we know that this instruction will remain uniform, check the cost of
5663 // the scalar version.
5664 if (isUniformAfterVectorization(I, VF))
5665 VF = 1;
5667 if (VF > 1 && isProfitableToScalarize(I, VF))
5668 return VectorizationCostTy(InstsToScalarize[VF][I], false);
5670 // Forced scalars do not have any scalarization overhead.
5671 auto ForcedScalar = ForcedScalars.find(VF);
5672 if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5673 auto InstSet = ForcedScalar->second;
5674 if (InstSet.find(I) != InstSet.end())
5675 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5676 }
5678 Type *VectorTy;
5679 unsigned C = getInstructionCost(I, VF, VectorTy);
5681 bool TypeNotScalarized =
5682 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5683 return VectorizationCostTy(C, TypeNotScalarized);
5684 }
5686 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5687 unsigned VF) {
5689 if (VF == 1)
5690 return 0;
5692 unsigned Cost = 0;
5693 Type *RetTy = ToVectorTy(I->getType(), VF);
5694 if (!RetTy->isVoidTy() &&
5695 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5696 Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5698 // Some targets keep addresses scalar.
5699 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5700 return Cost;
5702 // Some targets support efficient element stores.
5703 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5704 return Cost;
5706 // Collect operands to consider.
5707 CallInst *CI = dyn_cast<CallInst>(I);
5708 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5710 // Skip operands that do not require extraction/scalarization and do not incur
5711 // any overhead.
5712 return Cost + TTI.getOperandsScalarizationOverhead(
5713 filterExtractingOperands(Ops, VF), VF);
5714 }
5716 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5717 if (VF == 1)
5718 return;
5719 NumPredStores = 0;
5720 for (BasicBlock *BB : TheLoop->blocks()) {
5721 // For each instruction in the old loop.
5722 for (Instruction &I : *BB) {
5723 Value *Ptr = getLoadStorePointerOperand(&I);
5724 if (!Ptr)
5725 continue;
5727 // TODO: We should generate better code and update the cost model for
5728 // predicated uniform stores. Today they are treated as any other
5729 // predicated store (see added test cases in
5730 // invariant-store-vectorization.ll).
5731 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5732 NumPredStores++;
5734 if (Legal->isUniform(Ptr) &&
5735 // Conditional loads and stores should be scalarized and predicated.
5736 // isScalarWithPredication cannot be used here since masked
5737 // gather/scatters are not considered scalar with predication.
5738 !Legal->blockNeedsPredication(I.getParent())) {
5739 // TODO: Avoid replicating loads and stores instead of
5740 // relying on instcombine to remove them.
5741 // Load: Scalar load + broadcast
5742 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5743 unsigned Cost = getUniformMemOpCost(&I, VF);
5744 setWideningDecision(&I, VF, CM_Scalarize, Cost);
5745 continue;
5746 }
5748 // We assume that widening is the best solution when possible.
5749 if (memoryInstructionCanBeWidened(&I, VF)) {
5750 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5751 int ConsecutiveStride =
5752 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5753 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5754 "Expected consecutive stride.");
5755 InstWidening Decision =
5756 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5757 setWideningDecision(&I, VF, Decision, Cost);
5758 continue;
5759 }
5761 // Choose between Interleaving, Gather/Scatter or Scalarization.
5762 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5763 unsigned NumAccesses = 1;
5764 if (isAccessInterleaved(&I)) {
5765 auto Group = getInterleavedAccessGroup(&I);
5766 assert(Group && "Fail to get an interleaved access group.");
5768 // Make one decision for the whole group.
5769 if (getWideningDecision(&I, VF) != CM_Unknown)
5770 continue;
5772 NumAccesses = Group->getNumMembers();
5773 if (interleavedAccessCanBeWidened(&I, VF))
5774 InterleaveCost = getInterleaveGroupCost(&I, VF);
5775 }
5777 unsigned GatherScatterCost =
5778 isLegalGatherOrScatter(&I)
5779 ? getGatherScatterCost(&I, VF) * NumAccesses
5780 : std::numeric_limits<unsigned>::max();
5782 unsigned ScalarizationCost =
5783 getMemInstScalarizationCost(&I, VF) * NumAccesses;
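5784 // E.g. for a group of two interleaved loads, a single wide load plus shuffles (InterleaveCost) competes against two gathers or two fully scalarized accesses.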
5785 // Choose better solution for the current VF,
5786 // write down this decision and use it during vectorization.
5788 InstWidening Decision;
5789 if (InterleaveCost <= GatherScatterCost &&
5790 InterleaveCost < ScalarizationCost) {
5791 Decision = CM_Interleave;
5792 Cost = InterleaveCost;
5793 } else if (GatherScatterCost < ScalarizationCost) {
5794 Decision = CM_GatherScatter;
5795 Cost = GatherScatterCost;
5796 } else {
5797 Decision = CM_Scalarize;
5798 Cost = ScalarizationCost;
5799 }
5800 // If the instruction belongs to an interleave group, the whole group
5801 // receives the same decision. The whole group receives the cost, but
5802 // the cost will actually be assigned to one instruction.
5803 if (auto Group = getInterleavedAccessGroup(&I))
5804 setWideningDecision(Group, VF, Decision, Cost);
5805 else
5806 setWideningDecision(&I, VF, Decision, Cost);
5807 }
5808 }
5810 // Make sure that any load of address and any other address computation
5811 // remains scalar unless there is gather/scatter support. This avoids
5812 // inevitable extracts into address registers, and also has the benefit of
5813 // activating LSR more, since that pass can't optimize vectorized
5814 // addresses.
5815 if (TTI.prefersVectorizedAddressing())
5816 return;
5818 // Start with all scalar pointer uses.
5819 SmallPtrSet<Instruction *, 8> AddrDefs;
5820 for (BasicBlock *BB : TheLoop->blocks())
5821 for (Instruction &I : *BB) {
5822 Instruction *PtrDef =
5823 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5824 if (PtrDef && TheLoop->contains(PtrDef) &&
5825 getWideningDecision(&I, VF) != CM_GatherScatter)
5826 AddrDefs.insert(PtrDef);
5827 }
5829 // Add all instructions used to generate the addresses.
5830 SmallVector<Instruction *, 4> Worklist;
5831 for (auto *I : AddrDefs)
5832 Worklist.push_back(I);
5833 while (!Worklist.empty()) {
5834 Instruction *I = Worklist.pop_back_val();
5835 for (auto &Op : I->operands())
5836 if (auto *InstOp = dyn_cast<Instruction>(Op))
5837 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5838 AddrDefs.insert(InstOp).second)
5839 Worklist.push_back(InstOp);
5840 }
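5841 // AddrDefs now transitively contains every same-block, non-PHI instruction that feeds an address computation in the loop.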
5842 for (auto *I : AddrDefs) {
5843 if (isa<LoadInst>(I)) {
5844 // Setting the desired widening decision should ideally be handled by
5845 // cost functions, but since this involves the task of finding out
5846 // if the loaded register is involved in an address computation, it is
5847 // instead changed here when we know this is the case.
5848 InstWidening Decision = getWideningDecision(I, VF);
5849 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5850 // Scalarize a widened load of address.
5851 setWideningDecision(I, VF, CM_Scalarize,
5852 (VF * getMemoryInstructionCost(I, 1)));
5853 else if (auto Group = getInterleavedAccessGroup(I)) {
5854 // Scalarize an interleave group of address loads.
5855 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5856 if (Instruction *Member = Group->getMember(I))
5857 setWideningDecision(Member, VF, CM_Scalarize,
5858 (VF * getMemoryInstructionCost(Member, 1)));
5859 }
5860 }
5861 } else
5862 // Make sure I gets scalarized and a cost estimate without
5863 // scalarization overhead.
5864 ForcedScalars[VF].insert(I);
5865 }
5866 }
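5867 // Computes the cost of a single instruction at the given VF; VectorTy is set to the type the instruction will have after vectorization.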
5868 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5869 unsigned VF,
5870 Type *&VectorTy) {
5871 Type *RetTy = I->getType();
5872 if (canTruncateToMinimalBitwidth(I, VF))
5873 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5874 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5875 auto SE = PSE.getSE();
5877 // TODO: We need to estimate the cost of intrinsic calls.
5878 switch (I->getOpcode()) {
5879 case Instruction::GetElementPtr:
5880 // We mark this instruction as zero-cost because the cost of GEPs in
5881 // vectorized code depends on whether the corresponding memory instruction
5882 // is scalarized or not. Therefore, we handle GEPs with the memory
5883 // instruction cost.
5884 return 0;
5885 case Instruction::Br: {
5886 // In cases of scalarized and predicated instructions, there will be VF
5887 // predicated blocks in the vectorized loop. Each branch around these
5888 // blocks requires also an extract of its vector compare i1 element.
5889 bool ScalarPredicatedBB = false;
5890 BranchInst *BI = cast<BranchInst>(I);
5891 if (VF > 1 && BI->isConditional() &&
5892 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
5893 PredicatedBBsAfterVectorization.end() ||
5894 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
5895 PredicatedBBsAfterVectorization.end()))
5896 ScalarPredicatedBB = true;
5898 if (ScalarPredicatedBB) {
5899 // Return cost for branches around scalarized and predicated blocks.
5900 Type *Vec_i1Ty =
5901 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
5902 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
5903 (TTI.getCFInstrCost(Instruction::Br) * VF));
5904 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
5905 // The back-edge branch will remain, as will all scalar branches.
5906 return TTI.getCFInstrCost(Instruction::Br);
5908 // This branch will be eliminated by if-conversion.
5909 return 0;
5910 // Note: We currently assume zero cost for an unconditional branch inside
5911 // a predicated block since it will become a fall-through, although we
5912 // may decide in the future to call TTI for all branches.
5913 }
5914 case Instruction::PHI: {
5915 auto *Phi = cast<PHINode>(I);
5917 // First-order recurrences are replaced by vector shuffles inside the loop.
5918 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
5919 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
5920 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
5921 VectorTy, VF - 1, VectorType::get(RetTy, 1));
5923 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
5924 // converted into select instructions. We require N - 1 selects per phi
5925 // node, where N is the number of incoming values.
5926 if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
5927 return (Phi->getNumIncomingValues() - 1) *
5928 TTI.getCmpSelInstrCost(
5929 Instruction::Select, ToVectorTy(Phi->getType(), VF),
5930 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
5932 return TTI.getCFInstrCost(Instruction::PHI);
5933 }
5934 case Instruction::UDiv:
5935 case Instruction::SDiv:
5936 case Instruction::URem:
5937 case Instruction::SRem:
5938 // If we have a predicated instruction, it may not be executed for each
5939 // vector lane. Get the scalarization cost and scale this amount by the
5940 // probability of executing the predicated block. If the instruction is not
5941 // predicated, we fall through to the next case.
5942 if (VF > 1 && isScalarWithPredication(I)) {
5943 unsigned Cost = 0;
5945 // These instructions have a non-void type, so account for the phi nodes
5946 // that we will create. This cost is likely to be zero. The phi node
5947 // cost, if any, should be scaled by the block probability because it
5948 // models a copy at the end of each predicated block.
5949 Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
5951 // The cost of the non-predicated instruction.
5952 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
5954 // The cost of insertelement and extractelement instructions needed for
5955 // scalarization.
5956 Cost += getScalarizationOverhead(I, VF);
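5957 // E.g. for VF = 4: four phi copies, four scalar divides, plus packing overhead; with the default reciprocal block probability of 2, the division below then halves the total.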
5958 // Scale the cost by the probability of executing the predicated blocks.
5959 // This assumes the predicated block for each vector lane is equally
5960 // likely.
5961 return Cost / getReciprocalPredBlockProb();
5962 }
5963 LLVM_FALLTHROUGH;
5964 case Instruction::Add:
5965 case Instruction::FAdd:
5966 case Instruction::Sub:
5967 case Instruction::FSub:
5968 case Instruction::Mul:
5969 case Instruction::FMul:
5970 case Instruction::FDiv:
5971 case Instruction::FRem:
5972 case Instruction::Shl:
5973 case Instruction::LShr:
5974 case Instruction::AShr:
5975 case Instruction::And:
5976 case Instruction::Or:
5977 case Instruction::Xor: {
5978 // Since we will replace the stride by 1 the multiplication should go away.
5979 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
5980 return 0;
5981 // Certain instructions can be cheaper to vectorize if they have a constant
5982 // second vector operand. One example of this are shifts on x86.
5983 Value *Op2 = I->getOperand(1);
5984 TargetTransformInfo::OperandValueProperties Op2VP;
5985 TargetTransformInfo::OperandValueKind Op2VK =
5986 TTI.getOperandInfo(Op2, Op2VP);
5987 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
5988 Op2VK = TargetTransformInfo::OK_UniformValue;
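5989 // E.g. on x86 a shift by a loop-invariant amount can be priced as a uniform-operand shift rather than a fully variable vector shift.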
5990 SmallVector<const Value *, 4> Operands(I->operand_values());
5991 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
5992 return N * TTI.getArithmeticInstrCost(
5993 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
5994 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
5995 }
5996 case Instruction::FNeg: {
5997 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
5998 return N * TTI.getArithmeticInstrCost(
5999 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6000 TargetTransformInfo::OK_AnyValue,
6001 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6002 I->getOperand(0));
6003 }
6004 case Instruction::Select: {
6005 SelectInst *SI = cast<SelectInst>(I);
6006 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6007 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6008 Type *CondTy = SI->getCondition()->getType();
6009 if (!ScalarCond)
6010 CondTy = VectorType::get(CondTy, VF);
6012 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6013 }
6014 case Instruction::ICmp:
6015 case Instruction::FCmp: {
6016 Type *ValTy = I->getOperand(0)->getType();
6017 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6018 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6019 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6020 VectorTy = ToVectorTy(ValTy, VF);
6021 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6022 }
6023 case Instruction::Store:
6024 case Instruction::Load: {
6025 unsigned Width = VF;
6026 if (Width > 1) {
6027 InstWidening Decision = getWideningDecision(I, Width);
6028 assert(Decision != CM_Unknown &&
6029 "CM decision should be taken at this point");
6030 if (Decision == CM_Scalarize)
6031 Width = 1;
6032 }
6033 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6034 return getMemoryInstructionCost(I, VF);
6035 }
6036 case Instruction::ZExt:
6037 case Instruction::SExt:
6038 case Instruction::FPToUI:
6039 case Instruction::FPToSI:
6040 case Instruction::FPExt:
6041 case Instruction::PtrToInt:
6042 case Instruction::IntToPtr:
6043 case Instruction::SIToFP:
6044 case Instruction::UIToFP:
6045 case Instruction::Trunc:
6046 case Instruction::FPTrunc:
6047 case Instruction::BitCast: {
6048 // We optimize the truncation of induction variables having constant
6049 // integer steps. The cost of these truncations is the same as the scalar
6050 // operation.
6051 if (isOptimizableIVTruncate(I, VF)) {
6052 auto *Trunc = cast<TruncInst>(I);
6053 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6054 Trunc->getSrcTy(), Trunc);
6055 }
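6056 // (E.g. truncating an i64 induction with a constant step to i32 is charged as a single scalar trunc.)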
6057 Type *SrcScalarTy = I->getOperand(0)->getType();
6058 Type *SrcVecTy =
6059 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6060 if (canTruncateToMinimalBitwidth(I, VF)) {
6061 // This cast is going to be shrunk. This may remove the cast or it might
6062 // turn it into slightly different cast. For example, if MinBW == 16,
6063 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6065 // Calculate the modified src and dest types.
6066 Type *MinVecTy = VectorTy;
6067 if (I->getOpcode() == Instruction::Trunc) {
6068 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6069 VectorTy =
6070 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6071 } else if (I->getOpcode() == Instruction::ZExt ||
6072 I->getOpcode() == Instruction::SExt) {
6073 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6074 VectorTy =
6075 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6076 }
6077 }
6079 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6080 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6081 }
6082 case Instruction::Call: {
6083 bool NeedToScalarize;
6084 CallInst *CI = cast<CallInst>(I);
6085 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6086 if (getVectorIntrinsicIDForCall(CI, TLI))
6087 return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6088 return CallCost;
6089 }
6090 default:
6091 // The cost of executing VF copies of the scalar instruction. This opcode
6092 // is unknown. Assume that it is the same as 'mul'.
6093 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6094 getScalarizationOverhead(I, VF);
6095 } // end of switch.
6096 }
6098 char LoopVectorize::ID = 0;
6100 static const char lv_name[] = "Loop Vectorization";
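6101 // Legacy pass-manager registration; the INITIALIZE_PASS_DEPENDENCY lines declare the analyses the vectorizer relies on.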
6102 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6103 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6104 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6105 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6106 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6107 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6108 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6109 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6110 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6111 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6112 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6113 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6114 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6115 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6116 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6118 namespace llvm {
6120 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6122 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6123 bool VectorizeOnlyWhenForced) {
6124 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6125 }
6127 } // end namespace llvm
6129 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6130 // Check if the pointer operand of a load or store instruction is
6131 // consecutive.
6132 if (auto *Ptr = getLoadStorePointerOperand(Inst))
6133 return Legal->isConsecutivePtr(Ptr);
6134 return false;
6135 }
6137 void LoopVectorizationCostModel::collectValuesToIgnore() {
6138 // Ignore ephemeral values.
6139 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6141 // Ignore type-promoting instructions we identified during reduction
6142 // detection.
6143 for (auto &Reduction : *Legal->getReductionVars()) {
6144 RecurrenceDescriptor &RedDes = Reduction.second;
6145 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6146 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6147 }
6148 // Ignore type-casting instructions we identified during induction
6149 // detection.
6150 for (auto &Induction : *Legal->getInductionVars()) {
6151 InductionDescriptor &IndDes = Induction.second;
6152 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6153 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6154 }
6155 }
6157 // TODO: we could return a pair of values that specify the max VF and
6158 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6159 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6160 // doesn't have a cost model that can choose which plan to execute if
6161 // more than one is generated.
6162 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6163 LoopVectorizationCostModel &CM) {
6164 unsigned WidestType;
6165 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6166 return WidestVectorRegBits / WidestType;
6167 }
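6168 // (E.g. 256-bit vector registers and a widest scalar type of i32 give VF = 256 / 32 = 8.)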
6169 VectorizationFactor
6170 LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
6171 unsigned UserVF) {
6172 unsigned VF = UserVF;
6173 // Outer loop handling: They may require CFG and instruction level
6174 // transformations before even evaluating whether vectorization is profitable.
6175 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6176 // the vectorization pipeline.
6177 if (!OrigLoop->empty()) {
6178 // If the user doesn't provide a vectorization factor, determine a
6179 // reasonable one.
6180 if (!UserVF) {
6181 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6182 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6184 // Make sure we have a VF > 1 for stress testing.
6185 if (VPlanBuildStressTest && VF < 2) {
6186 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6187 << "overriding computed VF.\n");
6191 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6192 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6193 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6194 << " to build VPlans.\n");
6195 buildVPlans(VF, VF);
6197 // For VPlan build stress testing, we bail out after VPlan construction.
6198 if (VPlanBuildStressTest)
6199 return VectorizationFactor::Disabled();
6201 return {VF, 0};
6202 }
6204 LLVM_DEBUG(
6205 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6206 "VPlan-native path.\n");
6207 return VectorizationFactor::Disabled();
6208 }
6210 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
6211 unsigned UserVF) {
6212 assert(OrigLoop->empty() && "Inner loop expected.");
6213 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
6214 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6215 return None;
6217 // Invalidate interleave groups if all blocks of loop will be predicated.
6218 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6219 !useMaskedInterleavedAccesses(*TTI)) {
6220 LLVM_DEBUG(
6221 dbgs()
6222 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6223 "which requires masked-interleaved support.\n");
6224 CM.InterleaveInfo.reset();
6225 }
6227 if (UserVF) {
6228 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6229 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6230 // Collect the instructions (and their associated costs) that will be more
6231 // profitable to scalarize.
6232 CM.selectUserVectorizationFactor(UserVF);
6233 buildVPlansWithVPRecipes(UserVF, UserVF);
6234 LLVM_DEBUG(printPlans(dbgs()));
6235 return {{UserVF, 0}};
6236 }
6238 unsigned MaxVF = MaybeMaxVF.getValue();
6239 assert(MaxVF != 0 && "MaxVF is zero.");
6241 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6242 // Collect Uniform and Scalar instructions after vectorization with VF.
6243 CM.collectUniformsAndScalars(VF);
6245 // Collect the instructions (and their associated costs) that will be more
6246 // profitable to scalarize.
6247 if (VF > 1)
6248 CM.collectInstsToScalarize(VF);
6249 }
6251 buildVPlansWithVPRecipes(1, MaxVF);
6252 LLVM_DEBUG(printPlans(dbgs()));
6253 if (MaxVF == 1)
6254 return VectorizationFactor::Disabled();
6256 // Select the optimal vectorization factor.
6257 return CM.selectVectorizationFactor(MaxVF);
6258 }
6260 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6261 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6266 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6267 return !Plan->hasVF(VF);
6269 assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6272 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6273 DominatorTree *DT) {
6274 // Perform the actual loop transformation.
6276 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6277 VPCallbackILV CallbackILV(ILV);
6279 VPTransformState State{BestVF, BestUF, LI,
6280 DT, ILV.Builder, ILV.VectorLoopValueMap,
6281 &ILV, CallbackILV};
6282 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6283 State.TripCount = ILV.getOrCreateTripCount(nullptr);
6285 //===------------------------------------------------===//
6287 // Notice: any optimization or new instruction that goes
6288 // into the code below should also be implemented in
6289 // the cost-model.
6291 //===------------------------------------------------===//
6293 // 2. Copy and widen instructions from the old loop into the new loop.
6294 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6295 VPlans.front()->execute(&State);
6297 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6298 // predication, updating analyses.
6299 ILV.fixVectorizedLoop();
6300 }
6302 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6303 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6304 BasicBlock *Latch = OrigLoop->getLoopLatch();
6306 // We create new control-flow for the vectorized loop, so the original
6307 // condition will be dead after vectorization if it's only used by the
6308 // branch.
6309 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6310 if (Cmp && Cmp->hasOneUse())
6311 DeadInstructions.insert(Cmp);
6313 // We create new "steps" for induction variable updates to which the original
6314 // induction variables map. An original update instruction will be dead if
6315 // all its users except the induction variable are dead.
6316 for (auto &Induction : *Legal->getInductionVars()) {
6317 PHINode *Ind = Induction.first;
6318 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6319 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6320 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6321 DeadInstructions.end();
6322 }))
6323 DeadInstructions.insert(IndUpdate);
6325 // We record as "Dead" also the type-casting instructions we had identified
6326 // during induction analysis. We don't need any handling for them in the
6327 // vectorized loop because we have proven that, under a proper runtime
6328 // test guarding the vectorized loop, the value of the phi, and the casted
6329 // value of the phi, are the same. The last instruction in this casting chain
6330 // will get its scalar/vector/widened def from the scalar/vector/widened def
6331 // of the respective phi node. Any other casts in the induction def-use chain
6332 // have no other uses outside the phi update chain, and will be ignored.
6333 InductionDescriptor &IndDes = Induction.second;
6334 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6335 DeadInstructions.insert(Casts.begin(), Casts.end());
6336 }
6337 }
6339 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6341 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6343 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6344 Instruction::BinaryOps BinOp) {
6345 // When unrolling and the VF is 1, we only need to add a simple scalar.
6346 Type *Ty = Val->getType();
6347 assert(!Ty->isVectorTy() && "Val must be a scalar");
6349 if (Ty->isFloatingPointTy()) {
6350 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6352 // Floating point operations had to be 'fast' to enable the unrolling.
6353 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6354 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6355 }
6356 Constant *C = ConstantInt::get(Ty, StartIdx);
6357 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6358 }
6360 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6361 SmallVector<Metadata *, 4> MDs;
6362 // Reserve first location for self reference to the LoopID metadata node.
6363 MDs.push_back(nullptr);
6364 bool IsUnrollMetadata = false;
6365 MDNode *LoopID = L->getLoopID();
6366 if (LoopID) {
6367 // First find existing loop unrolling disable metadata.
6368 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6369 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6370 if (MD) {
6371 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6372 IsUnrollMetadata =
6373 S && S->getString().startswith("llvm.loop.unroll.disable");
6374 }
6375 MDs.push_back(LoopID->getOperand(i));
6376 }
6377 }
6379 if (!IsUnrollMetadata) {
6380 // Add runtime unroll disable metadata.
6381 LLVMContext &Context = L->getHeader()->getContext();
6382 SmallVector<Metadata *, 1> DisableOperands;
6383 DisableOperands.push_back(
6384 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6385 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6386 MDs.push_back(DisableNode);
6387 MDNode *NewLoopID = MDNode::get(Context, MDs);
6388 // Set operand 0 to refer to the loop id itself.
6389 NewLoopID->replaceOperandWith(0, NewLoopID);
6390 L->setLoopID(NewLoopID);
6391 }
6392 }
6394 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6395 const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6396 assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6397 bool PredicateAtRangeStart = Predicate(Range.Start);
6399 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6400 if (Predicate(TmpVF) != PredicateAtRangeStart) {
6401 Range.End = TmpVF;
6402 break;
6403 }
6405 return PredicateAtRangeStart;
6406 }
6408 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6409 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6410 /// of VF's starting at a given VF and extending it as much as possible. Each
6411 /// vectorization decision can potentially shorten this sub-range during
6412 /// buildVPlan().
6413 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6414 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6415 VFRange SubRange = {VF, MaxVF + 1};
6416 VPlans.push_back(buildVPlan(SubRange));
6417 VF = SubRange.End;
6418 }
6419 }
6421 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6422 VPlanPtr &Plan) {
6423 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6425 // Look for cached value.
6426 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6427 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6428 if (ECEntryIt != EdgeMaskCache.end())
6429 return ECEntryIt->second;
6431 VPValue *SrcMask = createBlockInMask(Src, Plan);
6433 // The terminator has to be a branch inst!
6434 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6435 assert(BI && "Unexpected terminator found");
6437 if (!BI->isConditional())
6438 return EdgeMaskCache[Edge] = SrcMask;
6440 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6441 assert(EdgeMask && "No Edge Mask found for condition");
6443 if (BI->getSuccessor(0) != Dst)
6444 EdgeMask = Builder.createNot(EdgeMask);
6446 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6447 EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6449 return EdgeMaskCache[Edge] = EdgeMask;
6450 }
6452 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6453 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6455 // Look for cached value.
6456 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6457 if (BCEntryIt != BlockMaskCache.end())
6458 return BCEntryIt->second;
6460 // All-one mask is modelled as no-mask following the convention for masked
6461 // load/store/gather/scatter. Initialize BlockMask to no-mask.
6462 VPValue *BlockMask = nullptr;
6464 if (OrigLoop->getHeader() == BB) {
6465 if (!CM.blockNeedsPredication(BB))
6466 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6468 // Introduce the early-exit compare IV <= BTC to form header block mask.
6469 // This is used instead of IV < TC because TC may wrap, unlike BTC.
6470 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6471 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6472 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6473 return BlockMaskCache[BB] = BlockMask;
6474 }
6476 // This is the block mask. We OR all incoming edges.
6477 for (auto *Predecessor : predecessors(BB)) {
6478 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6479 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6480 return BlockMaskCache[BB] = EdgeMask;
6482 if (!BlockMask) { // BlockMask has its initialized nullptr value.
6483 BlockMask = EdgeMask;
6484 continue;
6485 }
6487 BlockMask = Builder.createOr(BlockMask, EdgeMask);
6488 }
6490 return BlockMaskCache[BB] = BlockMask;
6491 }
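6492 // E.g. a block with two masked predecessors gets EdgeMask(P1) | EdgeMask(P2); an all-one incoming edge (nullptr) short-circuits the whole block mask to all-one.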
6493 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6494 VFRange &Range,
6495 VPlanPtr &Plan) {
6496 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6497 if (!IG)
6498 return nullptr;
6500 // Now check if IG is relevant for VF's in the given range.
6501 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6502 return [=](unsigned VF) -> bool {
6503 return (VF >= 2 && // Query is illegal for VF == 1
6504 CM.getWideningDecision(I, VF) ==
6505 LoopVectorizationCostModel::CM_Interleave);
6506 };
6507 };
6508 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6509 return nullptr;
6511 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6512 // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
6513 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6514 assert(I == IG->getInsertPos() &&
6515 "Generating a recipe for an adjunct member of an interleave group");
6517 VPValue *Mask = nullptr;
6518 if (Legal->isMaskRequired(I))
6519 Mask = createBlockInMask(I->getParent(), Plan);
6521 return new VPInterleaveRecipe(IG, Mask);
6522 }
6524 VPWidenMemoryInstructionRecipe *
6525 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6526 VPlanPtr &Plan) {
6527 if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6528 return nullptr;
6530 auto willWiden = [&](unsigned VF) -> bool {
6531 if (VF == 1)
6532 return false;
6533 if (CM.isScalarAfterVectorization(I, VF) ||
6534 CM.isProfitableToScalarize(I, VF))
6535 return false;
6536 LoopVectorizationCostModel::InstWidening Decision =
6537 CM.getWideningDecision(I, VF);
6538 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6539 "CM decision should be taken at this point.");
6540 assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6541 "Interleave memory opportunity should be caught earlier.");
6542 return Decision != LoopVectorizationCostModel::CM_Scalarize;
6543 };
6545 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6546 return nullptr;
6548 VPValue *Mask = nullptr;
6549 if (Legal->isMaskRequired(I))
6550 Mask = createBlockInMask(I->getParent(), Plan);
6552 return new VPWidenMemoryInstructionRecipe(*I, Mask);
6553 }
6555 VPWidenIntOrFpInductionRecipe *
6556 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6557 if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6558 // Check if this is an integer or fp induction. If so, build the recipe that
6559 // produces its scalar and vector values.
6560 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6561 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6562 II.getKind() == InductionDescriptor::IK_FpInduction)
6563 return new VPWidenIntOrFpInductionRecipe(Phi);
6564 return nullptr;
6565 }
6568 // Optimize the special case where the source is a constant integer
6569 // induction variable. Notice that we can only optimize the 'trunc' case
6570 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6571 // (c) other casts depend on pointer size.
6573 // Determine whether \p K is a truncation based on an induction variable that
6574 // can be optimized.
6575 auto isOptimizableIVTruncate =
6576 [&](Instruction *K) -> std::function<bool(unsigned)> {
6577 return
6578 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6579 };
6581 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6582 isOptimizableIVTruncate(I), Range))
6583 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6584 cast<TruncInst>(I));
6585 return nullptr;
6586 }
6588 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6589 PHINode *Phi = dyn_cast<PHINode>(I);
6590 if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6591 return nullptr;
6593 // We know that all PHIs in non-header blocks are converted into selects, so
6594 // we don't have to worry about the insertion order and we can just use the
6595 // builder. At this point we generate the predication tree. There may be
6596 // duplications since this is a simple recursive scan, but future
6597 // optimizations will clean it up.
6599 SmallVector<VPValue *, 2> Masks;
6600 unsigned NumIncoming = Phi->getNumIncomingValues();
6601 for (unsigned In = 0; In < NumIncoming; In++) {
6602 VPValue *EdgeMask =
6603 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6604 assert((EdgeMask || NumIncoming == 1) &&
6605 "Multiple predecessors with one having a full mask");
6607 Masks.push_back(EdgeMask);
6608 }
6609 return new VPBlendRecipe(Phi, Masks);
6610 }
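6611 // VPBlendRecipe::execute below lowers the phi into a chain of selects driven by these edge masks.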
6612 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6613 VFRange &Range) {
6615 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6616 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6618 if (IsPredicated)
6619 return false;
6621 auto IsVectorizableOpcode = [](unsigned Opcode) {
6622 switch (Opcode) {
6623 case Instruction::Add:
6624 case Instruction::And:
6625 case Instruction::AShr:
6626 case Instruction::BitCast:
6627 case Instruction::Br:
6628 case Instruction::Call:
6629 case Instruction::FAdd:
6630 case Instruction::FCmp:
6631 case Instruction::FDiv:
6632 case Instruction::FMul:
6633 case Instruction::FNeg:
6634 case Instruction::FPExt:
6635 case Instruction::FPToSI:
6636 case Instruction::FPToUI:
6637 case Instruction::FPTrunc:
6638 case Instruction::FRem:
6639 case Instruction::FSub:
6640 case Instruction::GetElementPtr:
6641 case Instruction::ICmp:
6642 case Instruction::IntToPtr:
6643 case Instruction::Load:
6644 case Instruction::LShr:
6645 case Instruction::Mul:
6646 case Instruction::Or:
6647 case Instruction::PHI:
6648 case Instruction::PtrToInt:
6649 case Instruction::SDiv:
6650 case Instruction::Select:
6651 case Instruction::SExt:
6652 case Instruction::Shl:
6653 case Instruction::SIToFP:
6654 case Instruction::SRem:
6655 case Instruction::Store:
6656 case Instruction::Sub:
6657 case Instruction::Trunc:
6658 case Instruction::UDiv:
6659 case Instruction::UIToFP:
6660 case Instruction::URem:
6661 case Instruction::Xor:
6662 case Instruction::ZExt:
6663 return true;
6664 }
6665 return false;
6666 };
6668 if (!IsVectorizableOpcode(I->getOpcode()))
6669 return false;
6671 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6672 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6673 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6674 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6675 return false;
6676 }
6678 auto willWiden = [&](unsigned VF) -> bool {
6679 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6680 CM.isProfitableToScalarize(I, VF)))
6681 return false;
6682 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6683 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6684 // The following case may be scalarized depending on the VF.
6685 // The flag shows whether we use Intrinsic or a usual Call for vectorized
6686 // version of the instruction.
6687 // Is it beneficial to perform intrinsic call compared to lib call?
6688 bool NeedToScalarize;
6689 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6690 bool UseVectorIntrinsic =
6691 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6692 return UseVectorIntrinsic || !NeedToScalarize;
6693 }
6694 if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6695 assert(CM.getWideningDecision(I, VF) ==
6696 LoopVectorizationCostModel::CM_Scalarize &&
6697 "Memory widening decisions should have been taken care by now");
6703 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6706 // Success: widen this instruction. We optimize the common case where
6707 // consecutive instructions can be represented by a single recipe.
6708 if (!VPBB->empty()) {
6709 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6710 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6711 return true;
6712 }
6714 VPBB->appendRecipe(new VPWidenRecipe(I));
6715 return true;
6716 }
6718 VPBasicBlock *VPRecipeBuilder::handleReplication(
6719 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6720 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6721 VPlanPtr &Plan) {
6722 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6723 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6724 Range);
6726 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6727 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6729 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6731 // Find if I uses a predicated instruction. If so, it will use its scalar
6732 // value. Avoid hoisting the insert-element which packs the scalar value into
6733 // a vector value, as that happens iff all users use the vector value.
6734 for (auto &Op : I->operands())
6735 if (auto *PredInst = dyn_cast<Instruction>(Op))
6736 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6737 PredInst2Recipe[PredInst]->setAlsoPack(false);
6739 // Finalize the recipe for Instr, first if it is not predicated.
6740 if (!IsPredicated) {
6741 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6742 VPBB->appendRecipe(Recipe);
6743 return VPBB;
6744 }
6745 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6746 assert(VPBB->getSuccessors().empty() &&
6747 "VPBB has successors when handling predicated replication.");
6748 // Record predicated instructions for above packing optimizations.
6749 PredInst2Recipe[I] = Recipe;
6750 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6751 VPBlockUtils::insertBlockAfter(Region, VPBB);
6752 auto *RegSucc = new VPBasicBlock();
6753 VPBlockUtils::insertBlockAfter(RegSucc, Region);
6754 return RegSucc;
6755 }
6757 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6758 VPRecipeBase *PredRecipe,
6759 VPlanPtr &Plan) {
6760 // Instructions marked for predication are replicated and placed under an
6761 // if-then construct to prevent side-effects.
6763 // Generate recipes to compute the block mask for this region.
6764 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6766 // Build the triangular if-then region.
6767 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6768 assert(Instr->getParent() && "Predicated instruction not in any basic block");
6769 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6770 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6771 auto *PHIRecipe =
6772 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6773 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6774 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6775 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6777 // Note: first set Entry as region entry and then connect successors starting
6778 // from it in order, to propagate the "parent" of each VPBasicBlock.
6779 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6780 VPBlockUtils::connectBlocks(Pred, Exit);
6782 return Region;
6783 }
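6784 // The triangle built above is what VPBranchOnMaskRecipe and VPPredInstPHIRecipe (see their execute() methods below) materialize during code generation.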
6785 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6786 VPlanPtr &Plan, VPBasicBlock *VPBB) {
6787 VPRecipeBase *Recipe = nullptr;
6788 // Check if Instr should belong to an interleave memory recipe, or already
6789 // does. In the latter case Instr is irrelevant.
6790 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6791 VPBB->appendRecipe(Recipe);
6792 return true;
6793 }
6795 // Check if Instr is a memory operation that should be widened.
6796 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6797 VPBB->appendRecipe(Recipe);
6798 return true;
6799 }
6801 // Check if Instr should form some PHI recipe.
6802 if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6803 VPBB->appendRecipe(Recipe);
6804 return true;
6805 }
6806 if ((Recipe = tryToBlend(Instr, Plan))) {
6807 VPBB->appendRecipe(Recipe);
6808 return true;
6809 }
6810 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6811 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6812 return true;
6813 }
6815 // Check if Instr is to be widened by a general VPWidenRecipe, after
6816 // having first checked for specific widening recipes that deal with
6817 // Interleave Groups, Inductions and Phi nodes.
6818 if (tryToWiden(Instr, VPBB, Range))
6819 return true;
6821 return false;
6822 }
6824 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6825 unsigned MaxVF) {
6826 assert(OrigLoop->empty() && "Inner loop expected.");
6828 // Collect conditions feeding internal conditional branches; they need to be
6829 // represented in VPlan for it to model masking.
6830 SmallPtrSet<Value *, 1> NeedDef;
6832 auto *Latch = OrigLoop->getLoopLatch();
6833 for (BasicBlock *BB : OrigLoop->blocks()) {
6834 if (BB == Latch)
6835 continue;
6836 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6837 if (Branch && Branch->isConditional())
6838 NeedDef.insert(Branch->getCondition());
6839 }
6841 // If the tail is to be folded by masking, the primary induction variable
6842 // needs to be represented in VPlan for it to model early-exit masking.
6843 if (CM.foldTailByMasking())
6844 NeedDef.insert(Legal->getPrimaryInduction());
6846 // Collect instructions from the original loop that will become trivially dead
6847 // in the vectorized loop. We don't need to vectorize these instructions. For
6848 // example, original induction update instructions can become dead because we
6849 // separately emit induction "steps" when generating code for the new loop.
6850 // Similarly, we create a new latch condition when setting up the structure
6851 // of the new loop, so the old one can become dead.
6852 SmallPtrSet<Instruction *, 4> DeadInstructions;
6853 collectTriviallyDeadInstructions(DeadInstructions);
6855 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6856 VFRange SubRange = {VF, MaxVF + 1};
6857 VPlans.push_back(
6858 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6859 VF = SubRange.End;
6860 }
6861 }
6863 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6864 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6865 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6866 // Hold a mapping from predicated instructions to their recipes, in order to
6867 // fix their AlsoPack behavior if a user is determined to replicate and use a
6868 // scalar instead of vector value.
6869 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6871 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6872 DenseMap<Instruction *, Instruction *> SinkAfterInverse;
6874 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
6875 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
6876 auto Plan = llvm::make_unique<VPlan>(VPBB);
6878 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
6879 // Represent values that will have defs inside VPlan.
6880 for (Value *V : NeedDef)
6881 Plan->addVPValue(V);
6883 // Scan the body of the loop in a topological order to visit each basic block
6884 // after having visited its predecessor basic blocks.
6885 LoopBlocksDFS DFS(OrigLoop);
6886 DFS.perform(LI);
6888 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6889 // Relevant instructions from basic block BB will be grouped into VPRecipe
6890 // ingredients and fill a new VPBasicBlock.
6891 unsigned VPBBsForBB = 0;
6892 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
6893 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
6894 VPBB = FirstVPBBForBB;
6895 Builder.setInsertPoint(VPBB);
6897 std::vector<Instruction *> Ingredients;
6899 // Organize the ingredients to vectorize from current basic block in the
6900 // right order.
6901 for (Instruction &I : BB->instructionsWithoutDebug()) {
6902 Instruction *Instr = &I;
6904 // First filter out irrelevant instructions, to ensure no recipes are
6905 // built for them.
6906 if (isa<BranchInst>(Instr) ||
6907 DeadInstructions.find(Instr) != DeadInstructions.end())
6908 continue;
6910 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
6911 // member of the IG, do not construct any Recipe for it.
6912 const InterleaveGroup<Instruction> *IG =
6913 CM.getInterleavedAccessGroup(Instr);
6914 if (IG && Instr != IG->getInsertPos() &&
6915 Range.Start >= 2 && // Query is illegal for VF == 1
6916 CM.getWideningDecision(Instr, Range.Start) ==
6917 LoopVectorizationCostModel::CM_Interleave) {
6918 auto SinkCandidate = SinkAfterInverse.find(Instr);
6919 if (SinkCandidate != SinkAfterInverse.end())
6920 Ingredients.push_back(SinkCandidate->second);
6921 continue;
6922 }
6924 // Move instructions to handle first-order recurrences, step 1: avoid
6925 // handling this instruction until after we've handled the instruction it
6926 // should follow.
6927 auto SAIt = SinkAfter.find(Instr);
6928 if (SAIt != SinkAfter.end()) {
6929 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
6931 << " to vectorize a 1st order recurrence.\n");
6932 SinkAfterInverse[SAIt->second] = Instr;
6936 Ingredients.push_back(Instr);
6938 // Move instructions to handle first-order recurrences, step 2: push the
6939 // instruction to be sunk at its insertion point.
6940 auto SAInvIt = SinkAfterInverse.find(Instr);
6941 if (SAInvIt != SinkAfterInverse.end())
6942 Ingredients.push_back(SAInvIt->second);
6943 }
6945 // Introduce each ingredient into VPlan.
6946 for (Instruction *Instr : Ingredients) {
6947 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
6948 continue;
6950 // Otherwise, if all widening options failed, Instruction is to be
6951 // replicated. This may create a successor for VPBB.
6952 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
6953 Instr, Range, VPBB, PredInst2Recipe, Plan);
6954 if (NextVPBB != VPBB) {
6955 VPBB = NextVPBB;
6956 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
6957 : "");
6958 }
6959 }
6960 }
6962 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
6963 // may also be empty, such as the last VPBB, reflecting original
6964 // basic-blocks with no recipes.
6965 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
6966 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
6967 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
6968 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
6969 delete PreEntry;
6971 std::string PlanName;
6972 raw_string_ostream RSO(PlanName);
6973 unsigned VF = Range.Start;
6974 Plan->addVF(VF);
6975 RSO << "Initial VPlan for VF={" << VF;
6976 for (VF *= 2; VF < Range.End; VF *= 2) {
6977 Plan->addVF(VF);
6978 RSO << "," << VF;
6979 }
6980 RSO << "},UF>=1";
6981 RSO.flush();
6982 Plan->setName(PlanName);
6984 return Plan;
6985 }
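6986 // (E.g. a plan covering VFs 2 through 8 is named "Initial VPlan for VF={2,4,8},UF>=1".)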
6987 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
6988 // Outer loop handling: They may require CFG and instruction level
6989 // transformations before even evaluating whether vectorization is profitable.
6990 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6991 // the vectorization pipeline.
6992 assert(!OrigLoop->empty());
6993 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6995 // Create new empty VPlan
6996 auto Plan = llvm::make_unique<VPlan>();
6998 // Build hierarchical CFG
6999 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7000 HCFGBuilder.buildHierarchicalCFG();
7002 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7003 Plan->addVF(VF);
7005 if (EnableVPlanPredication) {
7006 VPlanPredicator VPP(*Plan);
7007 VPP.predicate();
7009 // Avoid running transformation to recipes until masked code generation in
7010 // VPlan-native path is in place.
7011 return Plan;
7012 }
7014 SmallPtrSet<Instruction *, 1> DeadInstructions;
7015 VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7016 Plan, Legal->getInductionVars(), DeadInstructions);
7018 return Plan;
7019 }
7021 Value* LoopVectorizationPlanner::VPCallbackILV::
7022 getOrCreateVectorValues(Value *V, unsigned Part) {
7023 return ILV.getOrCreateVectorValue(V, Part);
7024 }
7026 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7027 O << " +\n"
7028 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7029 IG->getInsertPos()->printAsOperand(O, false);
7030 if (User) {
7031 O << ", ";
7032 User->getOperand(0)->printAsOperand(O);
7033 }
7034 O << "\\l\"";
7035 for (unsigned i = 0; i < IG->getFactor(); ++i)
7036 if (Instruction *I = IG->getMember(i))
7037 O << " +\n"
7038 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
7039 }
7041 void VPWidenRecipe::execute(VPTransformState &State) {
7042 for (auto &Instr : make_range(Begin, End))
7043 State.ILV->widenInstruction(Instr);
7044 }
7046 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7047 assert(!State.Instance && "Int or FP induction being replicated.");
7048 State.ILV->widenIntOrFpInduction(IV, Trunc);
7049 }
7051 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7052 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7053 }
7055 void VPBlendRecipe::execute(VPTransformState &State) {
7056 State.ILV->setDebugLocFromInst(State.Builder, Phi);
7057 // We know that all PHIs in non-header blocks are converted into
7058 // selects, so we don't have to worry about the insertion order and we
7059 // can just use the builder.
7060 // At this point we generate the predication tree. There may be
7061 // duplications since this is a simple recursive scan, but future
7062 // optimizations will clean it up.
7064 unsigned NumIncoming = Phi->getNumIncomingValues();
7066 assert((User || NumIncoming == 1) &&
7067 "Multiple predecessors with predecessors having a full mask");
7068 // Generate a sequence of selects of the form:
7069 // SELECT(Mask3, In3,
7070 // SELECT(Mask2, In2,
7071 // SELECT(Mask1, In1, In0)))
7072 InnerLoopVectorizer::VectorParts Entry(State.UF);
7073 for (unsigned In = 0; In < NumIncoming; ++In) {
7074 for (unsigned Part = 0; Part < State.UF; ++Part) {
7075 // We might have single edge PHIs (blocks) - use an identity
7076 // 'select' for the first PHI operand.
7077 Value *In0 =
7078 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7079 if (In == 0)
7080 Entry[Part] = In0; // Initialize with the first incoming value.
7081 else {
7082 // Select between the current value and the previous incoming edge
7083 // based on the incoming mask.
7084 Value *Cond = State.get(User->getOperand(In), Part);
7085 Entry[Part] =
7086 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7087 }
7088 }
7089 }
7090 for (unsigned Part = 0; Part < State.UF; ++Part)
7091 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7092 }
7094 void VPInterleaveRecipe::execute(VPTransformState &State) {
7095 assert(!State.Instance && "Interleave group being replicated.");
7097 return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7099 // Last (and currently only) operand is a mask.
7100 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7101 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7102 for (unsigned Part = 0; Part < State.UF; ++Part)
7103 MaskValues[Part] = State.get(Mask, Part);
7104 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7105 }
7107 void VPReplicateRecipe::execute(VPTransformState &State) {
7108 if (State.Instance) { // Generate a single instance.
7109 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7110 // Insert scalar instance packing it into a vector.
7111 if (AlsoPack && State.VF > 1) {
7112 // If we're constructing lane 0, initialize to start from undef.
7113 if (State.Instance->Lane == 0) {
7114 Value *Undef =
7115 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7116 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7117 }
7118 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7119 }
7120 return;
7121 }
7123 // Generate scalar instances for all VF lanes of all UF parts, unless the
7124 // instruction is uniform, in which case generate only the first lane for each
7125 // of the UF parts.
7126 unsigned EndLane = IsUniform ? 1 : State.VF;
7127 for (unsigned Part = 0; Part < State.UF; ++Part)
7128 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7129 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7130 }
7132 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7133 assert(State.Instance && "Branch on Mask works only on single instance.");
7135 unsigned Part = State.Instance->Part;
7136 unsigned Lane = State.Instance->Lane;
7138 Value *ConditionBit = nullptr;
7139 if (!User) // Block in mask is all-one.
7140 ConditionBit = State.Builder.getTrue();
7141 else {
7142 VPValue *BlockInMask = User->getOperand(0);
7143 ConditionBit = State.get(BlockInMask, Part);
7144 if (ConditionBit->getType()->isVectorTy())
7145 ConditionBit = State.Builder.CreateExtractElement(
7146 ConditionBit, State.Builder.getInt32(Lane));
7147 }
7149 // Replace the temporary unreachable terminator with a new conditional branch,
7150 // whose two destinations will be set later when they are created.
7151 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7152 assert(isa<UnreachableInst>(CurrentTerminator) &&
7153 "Expected to replace unreachable terminator with conditional branch.");
7154 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7155 CondBr->setSuccessor(0, nullptr);
7156 ReplaceInstWithInst(CurrentTerminator, CondBr);
7157 }
7159 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7160 assert(State.Instance && "Predicated instruction PHI works per instance.");
7161 Instruction *ScalarPredInst = cast<Instruction>(
7162 State.ValueMap.getScalarValue(PredInst, *State.Instance));
7163 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7164 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7165 assert(PredicatingBB && "Predicated block has no single predecessor.");
7167 // By current pack/unpack logic we need to generate only a single phi node: if
7168 // a vector value for the predicated instruction exists at this point it means
7169 // the instruction has vector users only, and a phi for the vector value is
7170 // needed. In this case the recipe of the predicated instruction is marked to
7171 // also do that packing, thereby "hoisting" the insert-element sequence.
7172 // Otherwise, a phi node for the scalar value is needed.
7173 unsigned Part = State.Instance->Part;
7174 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7175 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7176 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7177 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7178 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7179 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7180 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7181 } else {
7182 Type *PredInstType = PredInst->getType();
7183 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7184 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7185 Phi->addIncoming(ScalarPredInst, PredicatedBB);
7186 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7187 }
7188 }
7190 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7191 if (!User)
7192 return State.ILV->vectorizeMemoryInstruction(&Instr);
7194 // Last (and currently only) operand is a mask.
7195 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7196 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7197 for (unsigned Part = 0; Part < State.UF; ++Part)
7198 MaskValues[Part] = State.get(Mask, Part);
7199 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7200 }
7202 // Process the loop in the VPlan-native vectorization path. This path builds
7203 // VPlan upfront in the vectorization pipeline, which allows to apply
7204 // VPlan-to-VPlan transformations from the very beginning without modifying the
7205 // input IR.
7206 static bool processLoopInVPlanNativePath(
7207 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7208 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7209 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7210 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7211 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7213 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7214 Function *F = L->getHeader()->getParent();
7215 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7216 LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7217 &Hints, IAI);
7218 // Use the planner for outer loop vectorization.
7219 // TODO: CM is not used at this point inside the planner. Turn CM into an
7220 // optional argument if we don't need it in the future.
7221 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7223 // Get user vectorization factor.
7224 const unsigned UserVF = Hints.getWidth();
7226 // Check the function attributes and profiles to find out if this function
7227 // should be optimized for size.
7228 bool OptForSize =
7229 Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7230 (F->hasOptSize() ||
7231 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
7233 // Plan how to best vectorize, return the best VF and its cost.
7234 const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
7236 // If we are stress testing VPlan builds, do not attempt to generate vector
7237 // code. Masked vector code generation support will follow soon.
7238 // Also, do not attempt to vectorize if no vector code will be produced.
7239 if (VPlanBuildStressTest || EnableVPlanPredication ||
7240 VectorizationFactor::Disabled() == VF)
7241 return false;
7243 LVP.setBestPlan(VF.Width, 1);
7245 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7246 &CM);
7247 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7248 << L->getHeader()->getParent()->getName() << "\"\n");
7249 LVP.executePlan(LB, DT);
7251 // Mark the loop as already vectorized to avoid vectorizing again.
7252 Hints.setAlreadyVectorized();
7254 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

  const std::string DebugLocStr = getDebugLocString(L);

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");
  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");
  // Function containing the loop.
  Function *F = L->getHeader()->getParent();
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);
  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }
  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  bool OptForSize =
      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
      (F->hasOptSize() ||
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");
  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  // Prefer a known constant trip count, then profile data, then the upper
  // bound estimate.
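  // For example, a loop with a known constant trip count of 3 falls below
  // TinyTripCountVectorThreshold (16 by default), so it is vectorized only
  // under OptForSize, avoiding runtime guards and scalar iteration overheads.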
  unsigned ExpectedTC = 0;
  bool HasExpectedTC = false;
  if (const SCEVConstant *ConstExits =
      dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
    const APInt &ExitsCount = ConstExits->getAPInt();
    // We are interested in small values for ExpectedTC. Skip over those that
    // can't fit in an unsigned.
    if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
      ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
      HasExpectedTC = true;
    }
  }
  // ExpectedTC may be large because it's bound by a variable. Check
  // profiling information to validate we should vectorize.
  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
    auto EstimatedTC = getLoopEstimatedTripCount(L);
    if (EstimatedTC) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }
  }
  if (!HasExpectedTC) {
    ExpectedTC = SE->getSmallConstantMaxTripCount(L);
    HasExpectedTC = (ExpectedTC > 0);
  }
  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      // Loops with a very small trip count are considered for vectorization
      // under OptForSize, thereby making sure the cost of their loop body is
      // dominant, free of runtime guards and scalar iteration overheads.
      OptForSize = true;
    }
  }
  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem right -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat "
                         "attribute is used.\n");
    ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
                                     "NoImplicitFloat", L)
              << "loop not vectorized due to NoImplicitFloat attribute");
    Hints.emitRemarkWithHints();
    return false;
  }
  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    LLVM_DEBUG(
        dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
    ORE->emit(
        createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
        << "loop not vectorized due to unsafe FP support.");
    Hints.emitRemarkWithHints();
    return false;
  }
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }
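  // An interleave group of factor 2, for example, covers both loads in:
  //   for (i = 0; i < N; i += 2) {
  //     a = A[i];     // member 0 of the group
  //     b = A[i + 1]; // member 1 of the group
  //   }
  // letting them be widened into one wide load plus shuffles.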
  // Use the cost model.
  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF);
  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
  }
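  // For example (illustrative numbers): VF.Width == 4 with IC == 2 emits two
  // interleaved copies of the 4-wide vector body, covering 8 scalar
  // iterations per iteration of the vector loop.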
  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }
  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }
7455 if (!MaybeVF && UserIC > 1) {
7456 // Tell the user interleaving was avoided up-front, despite being explicitly
7458 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7459 "interleaving should be avoided up front\n");
7460 IntDiagMsg = std::make_pair(
7461 "InterleavingAvoided",
7462 "Ignoring UserIC, because interleaving was avoided up front");
7463 InterleaveLoop = false;
7464 } else if (IC == 1 && UserIC <= 1) {
7465 // Tell the user interleaving is not beneficial.
7466 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7467 IntDiagMsg = std::make_pair(
7468 "InterleavingNotBeneficial",
7469 "the cost-model indicates that interleaving is not beneficial");
7470 InterleaveLoop = false;
7472 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7473 IntDiagMsg.second +=
7474 " and is explicitly disabled or interleave count is set to 1";
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }
  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;
  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
7512 } else if (VectorizeLoop && !InterleaveLoop) {
7513 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7514 << ") in " << DebugLocStr << '\n');
7516 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7517 L->getStartLoc(), L->getHeader())
7518 << IntDiagMsg.second;
7520 } else if (VectorizeLoop && InterleaveLoop) {
7521 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7522 << ") in " << DebugLocStr << '\n');
7523 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7526 LVP.setBestPlan(VF.Width, IC);
  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;
    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }
7569 Optional<MDNode *> RemainderLoopID =
7570 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7571 LLVMLoopVectorizeFollowupEpilogue});
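  // A remainder-loop ID can be requested via followup metadata, e.g. (sketch):
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.vectorize.followup_epilogue",
  //          !{!"llvm.loop.unroll.disable"}}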
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);
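    // AddRuntimeUnrollDisableMetaData attaches
    // !{!"llvm.loop.unroll.runtime.disable"} to the remainder loop's
    // !llvm.loop metadata.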
    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}
bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
    return false;
  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();
  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}
PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;
7667 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7668 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7669 [&](Loop &L) -> const LoopAccessInfo & {
7670 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7671 return LAM.getResult<LoopAccessAnalysis>(L, AR);
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}