1 //===- LoopVectorizationLegality.cpp --------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file provides loop vectorization legality analysis. Original code
10 // resided in LoopVectorize.cpp for a long time.
12 // At this point, it is implemented as a utility class, not as an analysis
13 // pass. It should be easy to create an analysis pass around it if there
14 // is a need (but D45420 needs to happen first).
16 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
17 #include "llvm/Analysis/VectorUtils.h"
18 #include "llvm/IR/IntrinsicInst.h"
22 #define LV_NAME "loop-vectorize"
23 #define DEBUG_TYPE LV_NAME
25 extern cl::opt<bool> EnableVPlanPredication;
28 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
29 cl::desc("Enable if-conversion during vectorization."));
31 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
32 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
33 cl::desc("The maximum allowed number of runtime memory checks with a "
34 "vectorize(enable) pragma."));
36 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
37 "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
38 cl::desc("The maximum number of SCEV checks allowed."));
40 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
41 "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
42 cl::desc("The maximum number of SCEV checks allowed with a "
43 "vectorize(enable) pragma"));
45 /// Maximum vectorization interleave count.
46 static const unsigned MaxInterleaveFactor = 16;
51 static void debugVectorizationFailure(const StringRef DebugMsg,
53 dbgs() << "LV: Not vectorizing: " << DebugMsg;
62 OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName,
66 Value *CodeRegion = TheLoop->getHeader();
67 DebugLoc DL = TheLoop->getStartLoc();
70 CodeRegion = I->getParent();
71 // If there is no debug location attached to the instruction, revert back to
74 DL = I->getDebugLoc();
77 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
78 R << "loop not vectorized: ";
82 bool LoopVectorizeHints::Hint::validate(unsigned Val) {
85 return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
87 return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
91 return (Val == 0 || Val == 1);
96 LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
97 bool InterleaveOnlyWhenForced,
98 OptimizationRemarkEmitter &ORE)
99 : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
100 Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
101 Force("vectorize.enable", FK_Undefined, HK_FORCE),
102 IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
103 // Populate values with existing loop metadata.
104 getHintsFromMetadata();
106 // force-vector-interleave overrides DisableInterleaving.
107 if (VectorizerParams::isInterleaveForced())
108 Interleave.Value = VectorizerParams::VectorizationInterleave;
110 if (IsVectorized.Value != 1)
111 // If the vectorization width and interleaving count are both 1 then
112 // consider the loop to have been already vectorized because there's
113 // nothing more that we can do.
114 IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
115 LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
116 << "LV: Interleaving disabled by the pass manager\n");
119 void LoopVectorizeHints::setAlreadyVectorized() {
120 LLVMContext &Context = TheLoop->getHeader()->getContext();
122 MDNode *IsVectorizedMD = MDNode::get(
124 {MDString::get(Context, "llvm.loop.isvectorized"),
125 ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
126 MDNode *LoopID = TheLoop->getLoopID();
128 makePostTransformationMetadata(Context, LoopID,
129 {Twine(Prefix(), "vectorize.").str(),
130 Twine(Prefix(), "interleave.").str()},
132 TheLoop->setLoopID(NewLoopID);
134 // Update internal cache.
135 IsVectorized.Value = 1;
138 bool LoopVectorizeHints::allowVectorization(
139 Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
140 if (getForce() == LoopVectorizeHints::FK_Disabled) {
141 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
142 emitRemarkWithHints();
146 if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
147 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
148 emitRemarkWithHints();
152 if (getIsVectorized() == 1) {
153 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
154 // FIXME: Add interleave.disable metadata. This will allow
155 // vectorize.disable to be used without disabling the pass and errors
156 // to differentiate between disabled vectorization and a width of 1.
158 return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
159 "AllDisabled", L->getStartLoc(),
161 << "loop not vectorized: vectorization and interleaving are "
162 "explicitly disabled, or the loop has already been "
171 void LoopVectorizeHints::emitRemarkWithHints() const {
175 if (Force.Value == LoopVectorizeHints::FK_Disabled)
176 return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
177 TheLoop->getStartLoc(),
178 TheLoop->getHeader())
179 << "loop not vectorized: vectorization is explicitly disabled";
181 OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
182 TheLoop->getStartLoc(), TheLoop->getHeader());
183 R << "loop not vectorized";
184 if (Force.Value == LoopVectorizeHints::FK_Enabled) {
185 R << " (Force=" << NV("Force", true);
186 if (Width.Value != 0)
187 R << ", Vector Width=" << NV("VectorWidth", Width.Value);
188 if (Interleave.Value != 0)
189 R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
197 const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
200 if (getForce() == LoopVectorizeHints::FK_Disabled)
202 if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
204 return OptimizationRemarkAnalysis::AlwaysPrint;
207 void LoopVectorizeHints::getHintsFromMetadata() {
208 MDNode *LoopID = TheLoop->getLoopID();
212 // First operand should refer to the loop id itself.
213 assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
214 assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
216 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
217 const MDString *S = nullptr;
218 SmallVector<Metadata *, 4> Args;
220 // The expected hint is either a MDString or a MDNode with the first
221 // operand a MDString.
222 if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
223 if (!MD || MD->getNumOperands() == 0)
225 S = dyn_cast<MDString>(MD->getOperand(0));
226 for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
227 Args.push_back(MD->getOperand(i));
229 S = dyn_cast<MDString>(LoopID->getOperand(i));
230 assert(Args.size() == 0 && "too many arguments for MDString");
236 // Check if the hint starts with the loop metadata prefix.
237 StringRef Name = S->getString();
238 if (Args.size() == 1)
239 setHint(Name, Args[0]);
243 void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
244 if (!Name.startswith(Prefix()))
246 Name = Name.substr(Prefix().size(), StringRef::npos);
248 const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
251 unsigned Val = C->getZExtValue();
253 Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
254 for (auto H : Hints) {
255 if (Name == H->Name) {
256 if (H->validate(Val))
259 LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
265 bool LoopVectorizationRequirements::doesNotMeet(
266 Function *F, Loop *L, const LoopVectorizeHints &Hints) {
267 const char *PassName = Hints.vectorizeAnalysisPassName();
269 if (UnsafeAlgebraInst && !Hints.allowReordering()) {
271 return OptimizationRemarkAnalysisFPCommute(
272 PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
273 UnsafeAlgebraInst->getParent())
274 << "loop not vectorized: cannot prove it is safe to reorder "
275 "floating-point operations";
280 // Test if runtime memcheck thresholds are exceeded.
281 bool PragmaThresholdReached =
282 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
283 bool ThresholdReached =
284 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
285 if ((ThresholdReached && !Hints.allowReordering()) ||
286 PragmaThresholdReached) {
288 return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
291 << "loop not vectorized: cannot prove it is safe to reorder "
294 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
301 // Return true if the inner loop \p Lp is uniform with regard to the outer loop
302 // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
303 // executing the inner loop will execute the same iterations). This check is
304 // very constrained for now but it will be relaxed in the future. \p Lp is
305 // considered uniform if it meets all the following conditions:
306 // 1) it has a canonical IV (starting from 0 and with stride 1),
307 // 2) its latch terminator is a conditional branch and,
308 // 3) its latch condition is a compare instruction whose operands are the
309 // canonical IV and an OuterLp invariant.
310 // This check doesn't take into account the uniformity of other conditions not
311 // related to the loop latch because they don't affect the loop uniformity.
313 // NOTE: We decided to keep all these checks and its associated documentation
314 // together so that we can easily have a picture of the current supported loop
315 // nests. However, some of the current checks don't depend on \p OuterLp and
316 // would be redundantly executed for each \p Lp if we invoked this function for
317 // different candidate outer loops. This is not the case for now because we
318 // don't currently have the infrastructure to evaluate multiple candidate outer
319 // loops and \p OuterLp will be a fixed parameter while we only support explicit
320 // outer loop vectorization. It's also very likely that these checks go away
321 // before introducing the aforementioned infrastructure. However, if this is not
322 // the case, we should move the \p OuterLp independent checks to a separate
323 // function that is only executed once for each \p Lp.
324 static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
325 assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
327 // If Lp is the outer loop, it's uniform by definition.
330 assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
333 PHINode *IV = Lp->getCanonicalInductionVariable();
335 LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
340 BasicBlock *Latch = Lp->getLoopLatch();
341 auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
342 if (!LatchBr || LatchBr->isUnconditional()) {
343 LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
348 auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
351 dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
355 Value *CondOp0 = LatchCmp->getOperand(0);
356 Value *CondOp1 = LatchCmp->getOperand(1);
357 Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
358 if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
359 !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
360 LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
367 // Return true if \p Lp and all its nested loops are uniform with regard to \p
369 static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
370 if (!isUniformLoop(Lp, OuterLp))
373 // Check if nested loops are uniform.
374 for (Loop *SubLp : *Lp)
375 if (!isUniformLoopNest(SubLp, OuterLp))
381 /// Check whether it is safe to if-convert this phi node.
383 /// Phi nodes with constant expressions that can trap are not safe to if
385 static bool canIfConvertPHINodes(BasicBlock *BB) {
386 for (PHINode &Phi : BB->phis()) {
387 for (Value *V : Phi.incoming_values())
388 if (auto *C = dyn_cast<Constant>(V))
395 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
396 if (Ty->isPointerTy())
397 return DL.getIntPtrType(Ty);
399 // It is possible that char's or short's overflow when we ask for the loop's
400 // trip count, work around this by changing the type size.
401 if (Ty->getScalarSizeInBits() < 32)
402 return Type::getInt32Ty(Ty->getContext());
407 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
408 Ty0 = convertPointerToIntegerType(DL, Ty0);
409 Ty1 = convertPointerToIntegerType(DL, Ty1);
410 if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
415 /// Check that the instruction has outside loop users and is not an
416 /// identified reduction variable.
417 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
418 SmallPtrSetImpl<Value *> &AllowedExit) {
419 // Reductions, Inductions and non-header phis are allowed to have exit users. All
420 // other instructions must not have external users.
421 if (!AllowedExit.count(Inst))
422 // Check that all of the users of the loop are inside the BB.
423 for (User *U : Inst->users()) {
424 Instruction *UI = cast<Instruction>(U);
425 // This user may be a reduction exit value.
426 if (!TheLoop->contains(UI)) {
427 LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
434 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
435 const ValueToValueMap &Strides =
436 getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
438 int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
439 if (Stride == 1 || Stride == -1)
444 bool LoopVectorizationLegality::isUniform(Value *V) {
445 return LAI->isUniform(V);
448 void LoopVectorizationLegality::reportVectorizationFailure(
449 const StringRef DebugMsg, const StringRef OREMsg,
450 const StringRef ORETag, Instruction *I) const {
451 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
452 ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
453 ORETag, TheLoop, I) << OREMsg);
456 bool LoopVectorizationLegality::canVectorizeOuterLoop() {
457 assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
458 // Store the result and return it at the end instead of exiting early, in case
459 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
461 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
463 for (BasicBlock *BB : TheLoop->blocks()) {
464 // Check whether the BB terminator is a BranchInst. Any other terminator is
465 // not supported yet.
466 auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
468 reportVectorizationFailure("Unsupported basic block terminator",
469 "loop control flow is not understood by vectorizer",
477 // Check whether the BranchInst is a supported one. Only unconditional
478 // branches, conditional branches with an outer loop invariant condition or
479 // backedges are supported.
480 // FIXME: We skip these checks when VPlan predication is enabled as we
481 // want to allow divergent branches. This whole check will be removed
482 // once VPlan predication is on by default.
483 if (!EnableVPlanPredication && Br && Br->isConditional() &&
484 !TheLoop->isLoopInvariant(Br->getCondition()) &&
485 !LI->isLoopHeader(Br->getSuccessor(0)) &&
486 !LI->isLoopHeader(Br->getSuccessor(1))) {
487 reportVectorizationFailure("Unsupported conditional branch",
488 "loop control flow is not understood by vectorizer",
497 // Check whether inner loops are uniform. At this point, we only support
498 // simple outer loops scenarios with uniform nested loops.
499 if (!isUniformLoopNest(TheLoop /*loop nest*/,
500 TheLoop /*context outer loop*/)) {
501 reportVectorizationFailure("Outer loop contains divergent loops",
502 "loop control flow is not understood by vectorizer",
510 // Check whether we are able to set up outer loop induction.
511 if (!setupOuterLoopInductions()) {
512 reportVectorizationFailure("Unsupported outer loop Phi(s)",
513 "Unsupported outer loop Phi(s)",
524 void LoopVectorizationLegality::addInductionPhi(
525 PHINode *Phi, const InductionDescriptor &ID,
526 SmallPtrSetImpl<Value *> &AllowedExit) {
527 Inductions[Phi] = ID;
529 // In case this induction also comes with casts that we know we can ignore
530 // in the vectorized loop body, record them here. All casts could be recorded
531 // here for ignoring, but suffices to record only the first (as it is the
532 // only one that may bw used outside the cast sequence).
533 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
535 InductionCastsToIgnore.insert(*Casts.begin());
537 Type *PhiTy = Phi->getType();
538 const DataLayout &DL = Phi->getModule()->getDataLayout();
540 // Get the widest type.
541 if (!PhiTy->isFloatingPointTy()) {
543 WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
545 WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
548 // Int inductions are special because we only allow one IV.
549 if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
550 ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
551 isa<Constant>(ID.getStartValue()) &&
552 cast<Constant>(ID.getStartValue())->isNullValue()) {
554 // Use the phi node with the widest type as induction. Use the last
555 // one if there are multiple (no good reason for doing this other
556 // than it is expedient). We've checked that it begins at zero and
557 // steps by one, so this is a canonical induction variable.
558 if (!PrimaryInduction || PhiTy == WidestIndTy)
559 PrimaryInduction = Phi;
562 // Both the PHI node itself, and the "post-increment" value feeding
563 // back into the PHI node may have external users.
564 // We can allow those uses, except if the SCEVs we have for them rely
565 // on predicates that only hold within the loop, since allowing the exit
566 // currently means re-using this SCEV outside the loop (see PR33706 for more
568 if (PSE.getUnionPredicate().isAlwaysTrue()) {
569 AllowedExit.insert(Phi);
570 AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
573 LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
576 bool LoopVectorizationLegality::setupOuterLoopInductions() {
577 BasicBlock *Header = TheLoop->getHeader();
579 // Returns true if a given Phi is a supported induction.
580 auto isSupportedPhi = [&](PHINode &Phi) -> bool {
581 InductionDescriptor ID;
582 if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
583 ID.getKind() == InductionDescriptor::IK_IntInduction) {
584 addInductionPhi(&Phi, ID, AllowedExit);
587 // Bail out for any Phi in the outer loop header that is not a supported
591 << "LV: Found unsupported PHI for outer loop vectorization.\n");
596 if (llvm::all_of(Header->phis(), isSupportedPhi))
602 bool LoopVectorizationLegality::canVectorizeInstrs() {
603 BasicBlock *Header = TheLoop->getHeader();
605 // Look for the attribute signaling the absence of NaNs.
606 Function &F = *Header->getParent();
608 F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
610 // For each block in the loop.
611 for (BasicBlock *BB : TheLoop->blocks()) {
612 // Scan the instructions in the block and look for hazards.
613 for (Instruction &I : *BB) {
614 if (auto *Phi = dyn_cast<PHINode>(&I)) {
615 Type *PhiTy = Phi->getType();
616 // Check that this PHI type is allowed.
617 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
618 !PhiTy->isPointerTy()) {
619 reportVectorizationFailure("Found a non-int non-pointer PHI",
620 "loop control flow is not understood by vectorizer",
625 // If this PHINode is not in the header block, then we know that we
626 // can convert it to select during if-conversion. No need to check if
627 // the PHIs in this block are induction or reduction variables.
629 // Non-header phi nodes that have outside uses can be vectorized. Add
630 // them to the list of allowed exits.
631 // Unsafe cyclic dependencies with header phis are identified during
632 // legalization for reduction, induction and first order
634 AllowedExit.insert(&I);
638 // We only allow if-converted PHIs with exactly two incoming values.
639 if (Phi->getNumIncomingValues() != 2) {
640 reportVectorizationFailure("Found an invalid PHI",
641 "loop control flow is not understood by vectorizer",
642 "CFGNotUnderstood", Phi);
646 RecurrenceDescriptor RedDes;
647 if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
649 if (RedDes.hasUnsafeAlgebra())
650 Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
651 AllowedExit.insert(RedDes.getLoopExitInstr());
652 Reductions[Phi] = RedDes;
656 // TODO: Instead of recording the AllowedExit, it would be good to record the
657 // complementary set: NotAllowedExit. These include (but may not be
659 // 1. Reduction phis as they represent the one-before-last value, which
660 // is not available when vectorized
661 // 2. Induction phis and increment when SCEV predicates cannot be used
662 // outside the loop - see addInductionPhi
663 // 3. Non-Phis with outside uses when SCEV predicates cannot be used
664 // outside the loop - see call to hasOutsideLoopUser in the non-phi
666 // 4. FirstOrderRecurrence phis that can possibly be handled by
668 // By recording these, we can then reason about ways to vectorize each
669 // of these NotAllowedExit.
670 InductionDescriptor ID;
671 if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
672 addInductionPhi(Phi, ID, AllowedExit);
673 if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
674 Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
678 if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
680 FirstOrderRecurrences.insert(Phi);
684 // As a last resort, coerce the PHI to a AddRec expression
685 // and re-try classifying it a an induction PHI.
686 if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
687 addInductionPhi(Phi, ID, AllowedExit);
691 reportVectorizationFailure("Found an unidentified PHI",
692 "value that could not be identified as "
693 "reduction is used outside the loop",
694 "NonReductionValueUsedOutsideLoop", Phi);
696 } // end of PHI handling
698 // We handle calls that:
699 // * Are debug info intrinsics.
700 // * Have a mapping to an IR intrinsic.
701 // * Have a vector version available.
702 auto *CI = dyn_cast<CallInst>(&I);
703 if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
704 !isa<DbgInfoIntrinsic>(CI) &&
705 !(CI->getCalledFunction() && TLI &&
706 TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
707 // If the call is a recognized math libary call, it is likely that
708 // we can vectorize it given loosened floating-point constraints.
711 TLI && CI->getCalledFunction() &&
712 CI->getType()->isFloatingPointTy() &&
713 TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
714 TLI->hasOptimizedCodeGen(Func);
717 // TODO: Ideally, we should not use clang-specific language here,
718 // but it's hard to provide meaningful yet generic advice.
719 // Also, should this be guarded by allowExtraAnalysis() and/or be part
720 // of the returned info from isFunctionVectorizable()?
721 reportVectorizationFailure("Found a non-intrinsic callsite",
722 "library call cannot be vectorized. "
723 "Try compiling with -fno-math-errno, -ffast-math, "
725 "CantVectorizeLibcall", CI);
727 reportVectorizationFailure("Found a non-intrinsic callsite",
728 "call instruction cannot be vectorized",
729 "CantVectorizeLibcall", CI);
734 // Some intrinsics have scalar arguments and should be same in order for
735 // them to be vectorized (i.e. loop invariant).
737 auto *SE = PSE.getSE();
738 Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
739 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
740 if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
741 if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
742 reportVectorizationFailure("Found unvectorizable intrinsic",
743 "intrinsic instruction cannot be vectorized",
744 "CantVectorizeIntrinsic", CI);
750 // Check that the instruction return type is vectorizable.
751 // Also, we can't vectorize extractelement instructions.
752 if ((!VectorType::isValidElementType(I.getType()) &&
753 !I.getType()->isVoidTy()) ||
754 isa<ExtractElementInst>(I)) {
755 reportVectorizationFailure("Found unvectorizable type",
756 "instruction return type cannot be vectorized",
757 "CantVectorizeInstructionReturnType", &I);
761 // Check that the stored type is vectorizable.
762 if (auto *ST = dyn_cast<StoreInst>(&I)) {
763 Type *T = ST->getValueOperand()->getType();
764 if (!VectorType::isValidElementType(T)) {
765 reportVectorizationFailure("Store instruction cannot be vectorized",
766 "store instruction cannot be vectorized",
767 "CantVectorizeStore", ST);
771 // For nontemporal stores, check that a nontemporal vector version is
772 // supported on the target.
773 if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
774 // Arbitrarily try a vector of 2 elements.
775 Type *VecTy = VectorType::get(T, /*NumElements=*/2);
776 assert(VecTy && "did not find vectorized version of stored type");
777 unsigned Alignment = getLoadStoreAlignment(ST);
778 if (!TTI->isLegalNTStore(VecTy, Alignment)) {
779 reportVectorizationFailure(
780 "nontemporal store instruction cannot be vectorized",
781 "nontemporal store instruction cannot be vectorized",
782 "CantVectorizeNontemporalStore", ST);
787 } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
788 if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
789 // For nontemporal loads, check that a nontemporal vector version is
790 // supported on the target (arbitrarily try a vector of 2 elements).
791 Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
792 assert(VecTy && "did not find vectorized version of load type");
793 unsigned Alignment = getLoadStoreAlignment(LD);
794 if (!TTI->isLegalNTLoad(VecTy, Alignment)) {
795 reportVectorizationFailure(
796 "nontemporal load instruction cannot be vectorized",
797 "nontemporal load instruction cannot be vectorized",
798 "CantVectorizeNontemporalLoad", LD);
803 // FP instructions can allow unsafe algebra, thus vectorizable by
804 // non-IEEE-754 compliant SIMD units.
805 // This applies to floating-point math operations and calls, not memory
806 // operations, shuffles, or casts, as they don't change precision or
808 } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
810 LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
811 Hints->setPotentiallyUnsafe();
814 // Reduction instructions are allowed to have exit users.
815 // All other instructions must not have external users.
816 if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
817 // We can safely vectorize loops where instructions within the loop are
818 // used outside the loop only if the SCEV predicates within the loop is
819 // same as outside the loop. Allowing the exit means reusing the SCEV
821 if (PSE.getUnionPredicate().isAlwaysTrue()) {
822 AllowedExit.insert(&I);
825 reportVectorizationFailure("Value cannot be used outside the loop",
826 "value cannot be used outside the loop",
827 "ValueUsedOutsideLoop", &I);
833 if (!PrimaryInduction) {
834 if (Inductions.empty()) {
835 reportVectorizationFailure("Did not find one integer induction var",
836 "loop induction variable could not be identified",
837 "NoInductionVariable");
839 } else if (!WidestIndTy) {
840 reportVectorizationFailure("Did not find one integer induction var",
841 "integer loop induction variable could not be identified",
842 "NoIntegerInductionVariable");
845 LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
849 // Now we know the widest induction type, check if our found induction
850 // is the same size. If it's not, unset it here and InnerLoopVectorizer
851 // will create another.
852 if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
853 PrimaryInduction = nullptr;
858 bool LoopVectorizationLegality::canVectorizeMemory() {
859 LAI = &(*GetLAA)(*TheLoop);
860 const OptimizationRemarkAnalysis *LAR = LAI->getReport();
863 return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
864 "loop not vectorized: ", *LAR);
867 if (!LAI->canVectorizeMemory())
870 if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
871 reportVectorizationFailure("Stores to a uniform address",
872 "write to a loop invariant address could not be vectorized",
873 "CantVectorizeStoreToLoopInvariantAddress");
876 Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
877 PSE.addPredicate(LAI->getPSE().getUnionPredicate());
882 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
883 Value *In0 = const_cast<Value *>(V);
884 PHINode *PN = dyn_cast_or_null<PHINode>(In0);
888 return Inductions.count(PN);
891 bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
892 auto *Inst = dyn_cast<Instruction>(V);
893 return (Inst && InductionCastsToIgnore.count(Inst));
896 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
897 return isInductionPhi(V) || isCastedInductionVariable(V);
900 bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
901 return FirstOrderRecurrences.count(Phi);
904 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
905 return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
908 bool LoopVectorizationLegality::blockCanBePredicated(
909 BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
910 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
912 for (Instruction &I : *BB) {
913 // Check that we don't have a constant expression that can trap as operand.
914 for (Value *Operand : I.operands()) {
915 if (auto *C = dyn_cast<Constant>(Operand))
919 // We might be able to hoist the load.
920 if (I.mayReadFromMemory()) {
921 auto *LI = dyn_cast<LoadInst>(&I);
924 if (!SafePtrs.count(LI->getPointerOperand())) {
925 // !llvm.mem.parallel_loop_access implies if-conversion safety.
926 // Otherwise, record that the load needs (real or emulated) masking
927 // and let the cost model decide.
928 if (!IsAnnotatedParallel)
934 if (I.mayWriteToMemory()) {
935 auto *SI = dyn_cast<StoreInst>(&I);
938 // Predicated store requires some form of masking:
939 // 1) masked store HW instruction,
940 // 2) emulation via load-blend-store (only if safe and legal to do so,
941 // be aware on the race conditions), or
942 // 3) element-by-element predicate check and scalar store.
953 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
954 if (!EnableIfConversion) {
955 reportVectorizationFailure("If-conversion is disabled",
956 "if-conversion is disabled",
957 "IfConversionDisabled");
961 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
963 // A list of pointers that we can safely read and write to.
964 SmallPtrSet<Value *, 8> SafePointes;
966 // Collect safe addresses.
967 for (BasicBlock *BB : TheLoop->blocks()) {
968 if (blockNeedsPredication(BB))
971 for (Instruction &I : *BB)
972 if (auto *Ptr = getLoadStorePointerOperand(&I))
973 SafePointes.insert(Ptr);
976 // Collect the blocks that need predication.
977 BasicBlock *Header = TheLoop->getHeader();
978 for (BasicBlock *BB : TheLoop->blocks()) {
979 // We don't support switch statements inside loops.
980 if (!isa<BranchInst>(BB->getTerminator())) {
981 reportVectorizationFailure("Loop contains a switch statement",
982 "loop contains a switch statement",
983 "LoopContainsSwitch", BB->getTerminator());
987 // We must be able to predicate all blocks that need to be predicated.
988 if (blockNeedsPredication(BB)) {
989 if (!blockCanBePredicated(BB, SafePointes)) {
990 reportVectorizationFailure(
991 "Control flow cannot be substituted for a select",
992 "control flow cannot be substituted for a select",
993 "NoCFGForSelect", BB->getTerminator());
996 } else if (BB != Header && !canIfConvertPHINodes(BB)) {
997 reportVectorizationFailure(
998 "Control flow cannot be substituted for a select",
999 "control flow cannot be substituted for a select",
1000 "NoCFGForSelect", BB->getTerminator());
1005 // We can if-convert this loop.
1009 // Helper function to canVectorizeLoopNestCFG.
1010 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
1011 bool UseVPlanNativePath) {
1012 assert((UseVPlanNativePath || Lp->empty()) &&
1013 "VPlan-native path is not enabled.");
1015 // TODO: ORE should be improved to show more accurate information when an
1016 // outer loop can't be vectorized because a nested loop is not understood or
1017 // legal. Something like: "outer_loop_location: loop not vectorized:
1018 // (inner_loop_location) loop control flow is not understood by vectorizer".
1020 // Store the result and return it at the end instead of exiting early, in case
1021 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1023 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1025 // We must have a loop in canonical form. Loops with indirectbr in them cannot
1026 // be canonicalized.
1027 if (!Lp->getLoopPreheader()) {
1028 reportVectorizationFailure("Loop doesn't have a legal pre-header",
1029 "loop control flow is not understood by vectorizer",
1030 "CFGNotUnderstood");
1031 if (DoExtraAnalysis)
1037 // We must have a single backedge.
1038 if (Lp->getNumBackEdges() != 1) {
1039 reportVectorizationFailure("The loop must have a single backedge",
1040 "loop control flow is not understood by vectorizer",
1041 "CFGNotUnderstood");
1042 if (DoExtraAnalysis)
1048 // We must have a single exiting block.
1049 if (!Lp->getExitingBlock()) {
1050 reportVectorizationFailure("The loop must have an exiting block",
1051 "loop control flow is not understood by vectorizer",
1052 "CFGNotUnderstood");
1053 if (DoExtraAnalysis)
1059 // We only handle bottom-tested loops, i.e. loop in which the condition is
1060 // checked at the end of each iteration. With that we can assume that all
1061 // instructions in the loop are executed the same number of times.
1062 if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
1063 reportVectorizationFailure("The exiting block is not the loop latch",
1064 "loop control flow is not understood by vectorizer",
1065 "CFGNotUnderstood");
1066 if (DoExtraAnalysis)
1075 bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1076 Loop *Lp, bool UseVPlanNativePath) {
1077 // Store the result and return it at the end instead of exiting early, in case
1078 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1080 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1081 if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1082 if (DoExtraAnalysis)
1088 // Recursively check whether the loop control flow of nested loops is
1090 for (Loop *SubLp : *Lp)
1091 if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
1092 if (DoExtraAnalysis)
1101 bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1102 // Store the result and return it at the end instead of exiting early, in case
1103 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1106 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1107 // Check whether the loop-related control flow in the loop nest is expected by
1109 if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
1110 if (DoExtraAnalysis)
1116 // We need to have a loop header.
1117 LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1120 // Specific checks for outer loops. We skip the remaining legal checks at this
1121 // point because they don't support outer loops.
1122 if (!TheLoop->empty()) {
1123 assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1125 if (!canVectorizeOuterLoop()) {
1126 reportVectorizationFailure("Unsupported outer loop",
1127 "unsupported outer loop",
1128 "UnsupportedOuterLoop");
1129 // TODO: Implement DoExtraAnalysis when subsequent legal checks support
1134 LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1138 assert(TheLoop->empty() && "Inner loop expected.");
1139 // Check if we can if-convert non-single-bb loops.
1140 unsigned NumBlocks = TheLoop->getNumBlocks();
1141 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
1142 LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1143 if (DoExtraAnalysis)
1149 // Check if we can vectorize the instructions and CFG in this loop.
1150 if (!canVectorizeInstrs()) {
1151 LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1152 if (DoExtraAnalysis)
1158 // Go over each instruction and look at memory deps.
1159 if (!canVectorizeMemory()) {
1160 LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
1161 if (DoExtraAnalysis)
1167 LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
1168 << (LAI->getRuntimePointerChecking()->Need
1169 ? " (with a runtime bound check)"
1173 unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
1174 if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
1175 SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
1177 if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
1178 reportVectorizationFailure("Too many SCEV checks needed",
1179 "Too many SCEV assumptions need to be made and checked at runtime",
1180 "TooManySCEVRunTimeChecks");
1181 if (DoExtraAnalysis)
1187 // Okay! We've done all the tests. If any have failed, return false. Otherwise
1188 // we can vectorize, and at this point we don't have any other mem analysis
1189 // which may limit our maximum vectorization factor, so just return true with
1194 bool LoopVectorizationLegality::canFoldTailByMasking() {
1196 LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
1198 if (!PrimaryInduction) {
1199 reportVectorizationFailure(
1200 "No primary induction, cannot fold tail by masking",
1201 "Missing a primary induction variable in the loop, which is "
1202 "needed in order to fold tail by masking as required.",
1203 "NoPrimaryInduction");
1207 // TODO: handle reductions when tail is folded by masking.
1208 if (!Reductions.empty()) {
1209 reportVectorizationFailure(
1210 "Loop has reductions, cannot fold tail by masking",
1211 "Cannot fold tail by masking in the presence of reductions.",
1212 "ReductionFoldingTailByMasking");
1216 // TODO: handle outside users when tail is folded by masking.
1217 for (auto *AE : AllowedExit) {
1218 // Check that all users of allowed exit values are inside the loop.
1219 for (User *U : AE->users()) {
1220 Instruction *UI = cast<Instruction>(U);
1221 if (TheLoop->contains(UI))
1223 reportVectorizationFailure(
1224 "Cannot fold tail by masking, loop has an outside user for",
1225 "Cannot fold tail by masking in the presence of live outs.",
1226 "LiveOutFoldingTailByMasking", UI);
1231 // The list of pointers that we can safely read and write to remains empty.
1232 SmallPtrSet<Value *, 8> SafePointers;
1234 // Check and mark all blocks for predication, including those that ordinarily
1235 // do not need predication such as the header block.
1236 for (BasicBlock *BB : TheLoop->blocks()) {
1237 if (!blockCanBePredicated(BB, SafePointers)) {
1238 reportVectorizationFailure(
1239 "Cannot fold tail by masking as required",
1240 "control flow cannot be substituted for a select",
1241 "NoCFGForSelect", BB->getTerminator());
1246 LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");