1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file provides a helper that implements much of the TTI interface in
11 /// terms of the target-independent code generator and TargetLowering interfaces.
14 //===----------------------------------------------------------------------===//
16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17 #define LLVM_CODEGEN_BASICTTIIMPL_H
19 #include "llvm/ADT/APInt.h"
20 #include "llvm/ADT/ArrayRef.h"
21 #include "llvm/ADT/BitVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/Analysis/TargetTransformInfoImpl.h"
28 #include "llvm/CodeGen/ISDOpcodes.h"
29 #include "llvm/CodeGen/TargetLowering.h"
30 #include "llvm/CodeGen/TargetSubtargetInfo.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/Constant.h"
34 #include "llvm/IR/Constants.h"
35 #include "llvm/IR/DataLayout.h"
36 #include "llvm/IR/DerivedTypes.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Operator.h"
42 #include "llvm/IR/Type.h"
43 #include "llvm/IR/Value.h"
44 #include "llvm/Support/Casting.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/MachineValueType.h"
48 #include "llvm/Support/MathExtras.h"
49 #include "llvm/Target/TargetMachine.h"
61 class ScalarEvolution;
65 extern cl::opt<unsigned> PartialUnrollingThreshold;
67 /// Base class which can be used to help build a TTI implementation.
69 /// This class provides as much implementation of the TTI interface as is
70 /// possible using the target-independent parts of the code generator.
72 /// In order to subclass it, your class must implement a getST() method to
73 /// return the subtarget, and a getTLI() method to return the target lowering.
74 /// We need these methods implemented in the derived class so that this class
75 /// doesn't have to duplicate storage for them.
76 template <typename T>
77 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
79 using BaseT = TargetTransformInfoImplCRTPBase<T>;
80 using TTI = TargetTransformInfo;
82 /// Helper function to access this as a T.
83 T *thisT() { return static_cast<T *>(this); }
85 /// Estimate the cost of a broadcast shuffle as an extract plus a sequence of
86 /// insert operations.
87 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
88 InstructionCost Cost = 0;
89 // The broadcast cost is equal to the cost of extracting the zeroth element
90 // plus the cost of inserting it into every element of the result vector.
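// For example (illustrative): broadcasting a <4 x i32> value would be modeled
// as one extractelement of element 0 plus four insertelement operations.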
91 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
93 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
94 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
99 /// Estimate the cost of a permute shuffle as a sequence of extract and insert
100 /// operations.
101 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
102 InstructionCost Cost = 0;
103 // The shuffle cost is equal to the cost of extracting each element from its
104 // source vector plus the cost of inserting it into the result vector.
106 // E.g. a <4 x float> shuffle with mask <0,5,2,7> extracts index 0 of the
107 // first vector, index 1 of the second vector, index 2 of the first vector,
108 // and index 3 of the second vector, and inserts them at indices <0,1,2,3>
109 // of the result vector.
110 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
111 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
112 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
117 /// Estimate a cost of subvector extraction as a sequence of extract and
118 /// insert operations.
119 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
120 FixedVectorType *SubVTy) {
121 assert(VTy && SubVTy &&
122 "Can only extract subvectors from vectors");
123 int NumSubElts = SubVTy->getNumElements();
124 assert((!isa<FixedVectorType>(VTy) ||
125 (Index + NumSubElts) <=
126 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
127 "SK_ExtractSubvector index out of range");
129 InstructionCost Cost = 0;
130 // Subvector extraction cost is the cost of extracting each element from the
131 // source vector plus the cost of inserting it into the result subvector type.
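// For instance (illustrative): extracting a <2 x i32> subvector at index 2
// from an <8 x i32> vector is modeled as two extracts (indices 2 and 3) plus
// two inserts into the result subvector.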
133 for (int i = 0; i != NumSubElts; ++i) {
134 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
137 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
142 /// Estimate a cost of subvector insertion as a sequence of extract and
143 /// insert operations.
144 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
145 FixedVectorType *SubVTy) {
146 assert(VTy && SubVTy &&
147 "Can only insert subvectors into vectors");
148 int NumSubElts = SubVTy->getNumElements();
149 assert((!isa<FixedVectorType>(VTy) ||
150 (Index + NumSubElts) <=
151 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
152 "SK_InsertSubvector index out of range");
154 InstructionCost Cost = 0;
155 // Subvector insertion cost is the cost of extracting each element from the
156 // subvector plus the cost of inserting it into the result vector type.
158 for (int i = 0; i != NumSubElts; ++i) {
160 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
161 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
167 /// Local query method delegates up to T which *must* implement this!
168 const TargetSubtargetInfo *getST() const {
169 return static_cast<const T *>(this)->getST();
172 /// Local query method delegates up to T which *must* implement this!
173 const TargetLoweringBase *getTLI() const {
174 return static_cast<const T *>(this)->getTLI();
177 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
179 case TTI::MIM_Unindexed:
180 return ISD::UNINDEXED;
181 case TTI::MIM_PreInc:
182 return ISD::PRE_INC;
183 case TTI::MIM_PreDec:
184 return ISD::PRE_DEC;
185 case TTI::MIM_PostInc:
186 return ISD::POST_INC;
187 case TTI::MIM_PostDec:
188 return ISD::POST_DEC;
190 llvm_unreachable("Unexpected MemIndexedMode");
193 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
196 bool IsGatherScatter,
197 TTI::TargetCostKind CostKind) {
198 auto *VT = cast<FixedVectorType>(DataTy);
199 // Assume the target does not have support for gather/scatter operations
200 // and provide a rough estimate.
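// Roughly (illustrative), the estimate below sums: per-element address
// extracts (for gather/scatter only), one scalar memory op per element, the
// cost of packing/unpacking the vector, and, for variable masks, the cost of
// conditionally executing each element's memory op.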
202 // First, compute the cost of the individual memory operations.
203 InstructionCost AddrExtractCost =
205 ? getVectorInstrCost(Instruction::ExtractElement,
206 FixedVectorType::get(
207 PointerType::get(VT->getElementType(), 0),
208 VT->getNumElements()),
211 InstructionCost LoadCost =
212 VT->getNumElements() *
214 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
216 // Next, compute the cost of packing the result in a vector.
217 InstructionCost PackingCost = getScalarizationOverhead(
218 VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
220 InstructionCost ConditionalCost = 0;
222 // Compute the cost of conditionally executing the memory operations with
223 // variable masks. This includes extracting the individual conditions, the
224 // branches, and the PHIs needed to combine the results.
225 // NOTE: Estimating the cost of conditionally executing the memory
226 // operations accurately is quite difficult and the current solution
227 // provides a very rough estimate only.
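// For instance (rough illustration): a masked load of <4 x i32> with a
// variable mask adds four i1 extracts plus four branch/PHI pairs on top of
// the scalar load and packing costs computed above.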
229 VT->getNumElements() *
231 Instruction::ExtractElement,
232 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
233 VT->getNumElements()),
235 getCFInstrCost(Instruction::Br, CostKind) +
236 getCFInstrCost(Instruction::PHI, CostKind));
239 return LoadCost + PackingCost + ConditionalCost;
243 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
245 virtual ~BasicTTIImplBase() = default;
247 using TargetTransformInfoImplBase::DL;
250 /// \name Scalar TTI Implementations
252 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
253 unsigned AddressSpace, Align Alignment,
255 EVT E = EVT::getIntegerVT(Context, BitWidth);
256 return getTLI()->allowsMisalignedMemoryAccesses(
257 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
260 bool hasBranchDivergence() { return false; }
262 bool useGPUDivergenceAnalysis() { return false; }
264 bool isSourceOfDivergence(const Value *V) { return false; }
266 bool isAlwaysUniform(const Value *V) { return false; }
268 unsigned getFlatAddressSpace() {
269 // Return an invalid address space.
273 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
274 Intrinsic::ID IID) const {
278 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
279 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
282 unsigned getAssumedAddrSpace(const Value *V) const {
283 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
286 std::pair<const Value *, unsigned>
287 getPredicatedAddrSpace(const Value *V) const {
288 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
291 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
296 bool isLegalAddImmediate(int64_t imm) {
297 return getTLI()->isLegalAddImmediate(imm);
300 bool isLegalICmpImmediate(int64_t imm) {
301 return getTLI()->isLegalICmpImmediate(imm);
304 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
305 bool HasBaseReg, int64_t Scale,
306 unsigned AddrSpace, Instruction *I = nullptr) {
307 TargetLoweringBase::AddrMode AM;
309 AM.BaseOffs = BaseOffset;
310 AM.HasBaseReg = HasBaseReg;
312 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
315 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
316 const DataLayout &DL) const {
317 EVT VT = getTLI()->getValueType(DL, Ty);
318 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
321 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
322 const DataLayout &DL) const {
323 EVT VT = getTLI()->getValueType(DL, Ty);
324 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
327 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
328 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
331 bool isNumRegsMajorCostOfLSR() {
332 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
335 bool isProfitableLSRChainElement(Instruction *I) {
336 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
339 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
340 int64_t BaseOffset, bool HasBaseReg,
341 int64_t Scale, unsigned AddrSpace) {
342 TargetLoweringBase::AddrMode AM;
344 AM.BaseOffs = BaseOffset;
345 AM.HasBaseReg = HasBaseReg;
347 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
350 bool isTruncateFree(Type *Ty1, Type *Ty2) {
351 return getTLI()->isTruncateFree(Ty1, Ty2);
354 bool isProfitableToHoist(Instruction *I) {
355 return getTLI()->isProfitableToHoist(I);
358 bool useAA() const { return getST()->useAA(); }
360 bool isTypeLegal(Type *Ty) {
361 EVT VT = getTLI()->getValueType(DL, Ty);
362 return getTLI()->isTypeLegal(VT);
365 InstructionCost getRegUsageForType(Type *Ty) {
366 InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
367 assert(Val >= 0 && "Negative cost!");
371 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
372 ArrayRef<const Value *> Operands,
373 TTI::TargetCostKind CostKind) {
374 return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
377 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
378 unsigned &JumpTableSize,
379 ProfileSummaryInfo *PSI,
380 BlockFrequencyInfo *BFI) {
381 /// Try to find the estimated number of clusters. Note that the number of
382 /// clusters identified in this function could be different from the actual
383 /// numbers found in lowering. This function ignores switches that are
384 /// lowered with a mix of jump table / bit test / BTree. This function was
385 /// initially intended to be used when estimating the cost of a switch in
386 /// the inline cost heuristic, but it's a generic cost model to be used in
387 /// other places (e.g., in loop unrolling).
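/// For example (illustrative): a switch whose cases form the dense range
/// 0..15 would typically be counted as one jump-table cluster of size 16,
/// whereas a few widely scattered case values may instead be counted as
/// individual clusters.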
388 unsigned N = SI.getNumCases();
389 const TargetLoweringBase *TLI = getTLI();
390 const DataLayout &DL = this->getDataLayout();
393 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
395 // Early exit if neither a jump table nor a bit test is allowed.
396 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
399 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
400 APInt MinCaseVal = MaxCaseVal;
401 for (auto CI : SI.cases()) {
402 const APInt &CaseVal = CI.getCaseValue()->getValue();
403 if (CaseVal.sgt(MaxCaseVal))
404 MaxCaseVal = CaseVal;
405 if (CaseVal.slt(MinCaseVal))
406 MinCaseVal = CaseVal;
409 // Check if suitable for a bit test
410 if (N <= DL.getIndexSizeInBits(0u)) {
411 SmallPtrSet<const BasicBlock *, 4> Dests;
412 for (auto I : SI.cases())
413 Dests.insert(I.getCaseSuccessor());
415 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
420 // Check if suitable for a jump table.
422 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
425 (MaxCaseVal - MinCaseVal)
426 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
427 // Check whether a range of clusters is dense enough for a jump table
428 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
429 JumpTableSize = Range;
436 bool shouldBuildLookupTables() {
437 const TargetLoweringBase *TLI = getTLI();
438 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
439 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
442 bool shouldBuildRelLookupTables() const {
443 const TargetMachine &TM = getTLI()->getTargetMachine();
444 // In non-PIC mode, do not generate a relative lookup table.
445 if (!TM.isPositionIndependent())
448 /// Relative lookup table entries consist of 32-bit offsets.
449 /// Do not generate relative lookup tables for large code models
450 /// in 64-bit architectures where 32-bit offsets might not be enough.
451 if (TM.getCodeModel() == CodeModel::Medium ||
452 TM.getCodeModel() == CodeModel::Large)
455 Triple TargetTriple = TM.getTargetTriple();
456 if (!TargetTriple.isArch64Bit())
459 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
461 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
467 bool haveFastSqrt(Type *Ty) {
468 const TargetLoweringBase *TLI = getTLI();
469 EVT VT = TLI->getValueType(DL, Ty);
470 return TLI->isTypeLegal(VT) &&
471 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
474 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
478 InstructionCost getFPOpCost(Type *Ty) {
479 // Check whether FADD is available, as a proxy for floating-point support in general.
481 const TargetLoweringBase *TLI = getTLI();
482 EVT VT = TLI->getValueType(DL, Ty);
483 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
484 return TargetTransformInfo::TCC_Basic;
485 return TargetTransformInfo::TCC_Expensive;
488 unsigned getInliningThresholdMultiplier() { return 1; }
489 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
491 int getInlinerVectorBonusPercent() { return 150; }
493 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
494 TTI::UnrollingPreferences &UP,
495 OptimizationRemarkEmitter *ORE) {
496 // This unrolling functionality is target-independent, but to provide some
497 // motivation for its intended use, consider x86:
499 // According to the Intel 64 and IA-32 Architectures Optimization Reference
500 // Manual, Intel Core models and later have a loop stream detector (and
501 // associated uop queue) that can benefit from partial unrolling.
502 // The relevant requirements are:
503 // - The loop must have no more than 4 (8 for Nehalem and later) branches
504 // taken, and none of them may be calls.
505 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
507 // According to the Software Optimization Guide for AMD Family 15h
508 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
509 // and loop buffer which can benefit from partial unrolling.
510 // The relevant requirements are:
511 // - The loop must have fewer than 16 branches
512 // - The loop must have less than 40 uops in all executed loop branches
514 // The number of taken branches in a loop is hard to estimate here, and
515 // benchmarking has revealed that it is better not to be conservative when
516 // estimating the branch count. As a result, we'll ignore the branch limits
517 // until someone finds a case where it matters in practice.
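// For example (illustrative): on a subtarget that reports
// LoopMicroOpBufferSize == 28, MaxOps (and thus UP.PartialThreshold) becomes
// 28, so a call-free loop body of roughly nine instructions could be
// partially unrolled about 3x.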
520 const TargetSubtargetInfo *ST = getST();
521 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
522 MaxOps = PartialUnrollingThreshold;
523 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
524 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
528 // Scan the loop: don't unroll loops with calls.
529 for (BasicBlock *BB : L->blocks()) {
530 for (Instruction &I : *BB) {
531 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
532 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
533 if (!thisT()->isLoweredToCall(F))
539 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
541 << "advising against unrolling the loop because it "
543 << ore::NV("Call", &I);
551 // Enable runtime and partial unrolling up to the specified size.
552 // Enable using trip count upper bound to unroll loops.
553 UP.Partial = UP.Runtime = UP.UpperBound = true;
554 UP.PartialThreshold = MaxOps;
556 // Avoid unrolling when optimizing for size.
557 UP.OptSizeThreshold = 0;
558 UP.PartialOptSizeThreshold = 0;
560 // Set the number of instructions optimized when a "back edge" becomes a
561 // "fall through" to its default value of 2.
565 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
566 TTI::PeelingPreferences &PP) {
568 PP.AllowPeeling = true;
569 PP.AllowLoopNestsPeeling = false;
570 PP.PeelProfiledIterations = true;
573 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
575 TargetLibraryInfo *LibInfo,
576 HardwareLoopInfo &HWLoopInfo) {
577 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
580 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
581 AssumptionCache &AC, TargetLibraryInfo *TLI,
583 const LoopAccessInfo *LAI) {
584 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
587 bool emitGetActiveLaneMask() {
588 return BaseT::emitGetActiveLaneMask();
591 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
593 return BaseT::instCombineIntrinsic(IC, II);
596 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
600 bool &KnownBitsComputed) {
601 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
605 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
606 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
607 APInt &UndefElts2, APInt &UndefElts3,
608 std::function<void(Instruction *, unsigned, APInt, APInt &)>
610 return BaseT::simplifyDemandedVectorEltsIntrinsic(
611 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
615 InstructionCost getInstructionLatency(const Instruction *I) {
616 if (isa<LoadInst>(I))
617 return getST()->getSchedModel().DefaultLoadLatency;
619 return BaseT::getInstructionLatency(I);
622 virtual Optional<unsigned>
623 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
624 return Optional<unsigned>(
625 getST()->getCacheSize(static_cast<unsigned>(Level)));
628 virtual Optional<unsigned>
629 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
630 Optional<unsigned> TargetResult =
631 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
636 return BaseT::getCacheAssociativity(Level);
639 virtual unsigned getCacheLineSize() const {
640 return getST()->getCacheLineSize();
643 virtual unsigned getPrefetchDistance() const {
644 return getST()->getPrefetchDistance();
647 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
648 unsigned NumStridedMemAccesses,
649 unsigned NumPrefetches,
650 bool HasCall) const {
651 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
652 NumPrefetches, HasCall);
655 virtual unsigned getMaxPrefetchIterationsAhead() const {
656 return getST()->getMaxPrefetchIterationsAhead();
659 virtual bool enableWritePrefetching() const {
660 return getST()->enableWritePrefetching();
665 /// \name Vector TTI Implementations
668 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
669 return TypeSize::getFixed(32);
672 Optional<unsigned> getMaxVScale() const { return None; }
673 Optional<unsigned> getVScaleForTuning() const { return None; }
675 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
676 /// are set if the demanded result elements need to be inserted and/or
677 /// extracted from vectors.
678 InstructionCost getScalarizationOverhead(VectorType *InTy,
679 const APInt &DemandedElts,
680 bool Insert, bool Extract) {
681 /// FIXME: a bitfield is not a reasonable abstraction for talking about
682 /// which elements are needed from a scalable vector
683 auto *Ty = cast<FixedVectorType>(InTy);
685 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
686 "Vector size mismatch");
688 InstructionCost Cost = 0;
690 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
691 if (!DemandedElts[i])
694 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
696 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
702 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
703 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
705 auto *Ty = cast<FixedVectorType>(InTy);
707 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
708 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
711 /// Estimate the overhead of scalarizing an instruction's unique
712 /// non-constant operands. The (potentially vector) types to use for each
713 /// argument are passed via Tys.
714 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
715 ArrayRef<Type *> Tys) {
716 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
718 InstructionCost Cost = 0;
719 SmallPtrSet<const Value*, 4> UniqueOperands;
720 for (int I = 0, E = Args.size(); I != E; I++) {
721 // Disregard things like metadata arguments.
722 const Value *A = Args[I];
724 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
725 !Ty->isPtrOrPtrVectorTy())
728 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
729 if (auto *VecTy = dyn_cast<VectorType>(Ty))
730 Cost += getScalarizationOverhead(VecTy, false, true);
737 /// Estimate the overhead of scalarizing the inputs and outputs of an
738 /// instruction, with return type RetTy and arguments Args of type Tys. If
739 /// Args are unknown (empty), then the cost associated with one argument is
740 /// added as a heuristic.
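/// For instance (illustrative): a call returning <4 x float> with two
/// distinct non-constant <4 x float> operands would roughly be costed as
/// four result inserts plus eight operand extracts.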
741 InstructionCost getScalarizationOverhead(VectorType *RetTy,
742 ArrayRef<const Value *> Args,
743 ArrayRef<Type *> Tys) {
744 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
746 Cost += getOperandsScalarizationOverhead(Args, Tys);
748 // When no information on arguments is provided, we add the cost
749 // associated with one argument as a heuristic.
750 Cost += getScalarizationOverhead(RetTy, false, true);
755 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
757 InstructionCost getArithmeticInstrCost(
758 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
759 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
760 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
761 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
762 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
763 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
764 const Instruction *CxtI = nullptr) {
765 // Check if any of the operands are vector operands.
766 const TargetLoweringBase *TLI = getTLI();
767 int ISD = TLI->InstructionOpcodeToISD(Opcode);
768 assert(ISD && "Invalid opcode");
770 // TODO: Handle more cost kinds.
771 if (CostKind != TTI::TCK_RecipThroughput)
772 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
774 Opd1PropInfo, Opd2PropInfo,
777 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
779 bool IsFloat = Ty->isFPOrFPVectorTy();
780 // Assume that floating point arithmetic operations cost twice as much as
781 // integer operations.
782 InstructionCost OpCost = (IsFloat ? 2 : 1);
784 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
785 // The operation is legal. Assume it costs 1.
786 // TODO: Once we have extract/insert subvector cost we need to use them.
787 return LT.first * OpCost;
790 if (!TLI->isOperationExpand(ISD, LT.second)) {
791 // If the operation is custom lowered, then assume that the code is twice
793 return LT.first * 2 * OpCost;
796 // An 'Expand' of URem and SRem is special because it may default
797 // to expanding the operation into a sequence of sub-operations
798 // i.e. X % Y -> X-(X/Y)*Y.
799 if (ISD == ISD::UREM || ISD == ISD::SREM) {
800 bool IsSigned = ISD == ISD::SREM;
801 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
803 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
805 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
806 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
807 DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
809 InstructionCost MulCost =
810 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
811 InstructionCost SubCost =
812 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
813 return DivCost + MulCost + SubCost;
817 // We cannot scalarize scalable vectors, so return Invalid.
818 if (isa<ScalableVectorType>(Ty))
819 return InstructionCost::getInvalid();
821 // Else, assume that we need to scalarize this op.
822 // TODO: If one of the types get legalized by splitting, handle this
823 // similarly to what getCastInstrCost() does.
824 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
825 InstructionCost Cost = thisT()->getArithmeticInstrCost(
826 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
827 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
828 // Return the cost of multiple scalar invocations plus the cost of
829 // inserting and extracting the values.
830 SmallVector<Type *> Tys(Args.size(), Ty);
831 return getScalarizationOverhead(VTy, Args, Tys) +
832 VTy->getNumElements() * Cost;
835 // We don't know anything about this scalar instruction.
839 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
840 ArrayRef<int> Mask) const {
841 int Limit = Mask.size() * 2;
843 // Extra check required by isSingleSourceMaskImpl function (called by
844 // ShuffleVectorInst::isSingleSourceMask).
845 any_of(Mask, [Limit](int I) { return I >= Limit; }))
848 case TTI::SK_PermuteSingleSrc:
849 if (ShuffleVectorInst::isReverseMask(Mask))
850 return TTI::SK_Reverse;
851 if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
852 return TTI::SK_Broadcast;
854 case TTI::SK_PermuteTwoSrc:
855 if (ShuffleVectorInst::isSelectMask(Mask))
856 return TTI::SK_Select;
857 if (ShuffleVectorInst::isTransposeMask(Mask))
858 return TTI::SK_Transpose;
861 case TTI::SK_Reverse:
862 case TTI::SK_Broadcast:
863 case TTI::SK_Transpose:
864 case TTI::SK_InsertSubvector:
865 case TTI::SK_ExtractSubvector:
872 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
873 ArrayRef<int> Mask, int Index,
876 switch (improveShuffleKindFromMask(Kind, Mask)) {
877 case TTI::SK_Broadcast:
878 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
881 case TTI::SK_Reverse:
882 case TTI::SK_Transpose:
883 case TTI::SK_PermuteSingleSrc:
884 case TTI::SK_PermuteTwoSrc:
885 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
886 case TTI::SK_ExtractSubvector:
887 return getExtractSubvectorOverhead(Tp, Index,
888 cast<FixedVectorType>(SubTp));
889 case TTI::SK_InsertSubvector:
890 return getInsertSubvectorOverhead(Tp, Index,
891 cast<FixedVectorType>(SubTp));
893 llvm_unreachable("Unknown TTI::ShuffleKind");
896 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
897 TTI::CastContextHint CCH,
898 TTI::TargetCostKind CostKind,
899 const Instruction *I = nullptr) {
900 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
903 const TargetLoweringBase *TLI = getTLI();
904 int ISD = TLI->InstructionOpcodeToISD(Opcode);
905 assert(ISD && "Invalid opcode");
906 std::pair<InstructionCost, MVT> SrcLT =
907 TLI->getTypeLegalizationCost(DL, Src);
908 std::pair<InstructionCost, MVT> DstLT =
909 TLI->getTypeLegalizationCost(DL, Dst);
911 TypeSize SrcSize = SrcLT.second.getSizeInBits();
912 TypeSize DstSize = DstLT.second.getSizeInBits();
913 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
914 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
919 case Instruction::Trunc:
920 // Check for NOOP conversions.
921 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
924 case Instruction::BitCast:
925 // Bitcasts between types that are legalized to the same type are free, and
926 // we assume an int to/from ptr cast of the same size is also free.
927 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
931 case Instruction::FPExt:
932 if (I && getTLI()->isExtFree(I))
935 case Instruction::ZExt:
936 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
939 case Instruction::SExt:
940 if (I && getTLI()->isExtFree(I))
943 // If this is a zext/sext of a load, return 0 if the corresponding
944 // extending load exists on the target and the result type is legal.
945 if (CCH == TTI::CastContextHint::Normal) {
946 EVT ExtVT = EVT::getEVT(Dst);
947 EVT LoadVT = EVT::getEVT(Src);
949 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
950 if (DstLT.first == SrcLT.first &&
951 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
955 case Instruction::AddrSpaceCast:
956 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
957 Dst->getPointerAddressSpace()))
962 auto *SrcVTy = dyn_cast<VectorType>(Src);
963 auto *DstVTy = dyn_cast<VectorType>(Dst);
965 // If the cast is marked as legal (or promote) then assume low cost.
966 if (SrcLT.first == DstLT.first &&
967 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
970 // Handle scalar conversions.
971 if (!SrcVTy && !DstVTy) {
972 // Just check the op cost. If the operation is legal then assume it costs 1.
974 if (!TLI->isOperationExpand(ISD, DstLT.second))
977 // Assume that illegal scalar instructions are expensive.
981 // Check vector-to-vector casts.
982 if (DstVTy && SrcVTy) {
983 // If the cast is between same-sized registers, then the check is simple.
984 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
986 // Assume that Zext is done using AND.
987 if (Opcode == Instruction::ZExt)
990 // Assume that sext is done using SHL and SRA.
991 if (Opcode == Instruction::SExt)
992 return SrcLT.first * 2;
994 // Just check the op cost. If the operation is legal then assume it costs
996 // 1 and multiply by the type-legalization overhead.
997 if (!TLI->isOperationExpand(ISD, DstLT.second))
998 return SrcLT.first * 1;
1001 // If we are legalizing by splitting, query the concrete TTI for the cost
1002 // of casting the original vector twice. We also need to factor in the
1003 // cost of the split itself. Count that as 1, to be consistent with
1004 // TLI->getTypeLegalizationCost().
1006 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1007 TargetLowering::TypeSplitVector;
1009 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1010 TargetLowering::TypeSplitVector;
1011 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1012 DstVTy->getElementCount().isVector()) {
1013 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1014 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1015 T *TTI = static_cast<T *>(this);
1016 // If both types need to be split then the split is free.
1017 InstructionCost SplitCost =
1018 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1020 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1024 // The scalarization cost is Invalid; we can't assume any number of elements.
1025 if (isa<ScalableVectorType>(DstVTy))
1026 return InstructionCost::getInvalid();
1028 // In other cases where the source or destination type is illegal, assume
1029 // the operation will get scalarized.
1030 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1031 InstructionCost Cost = thisT()->getCastInstrCost(
1032 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1034 // Return the cost of multiple scalar invocations plus the cost of
1035 // inserting and extracting the values.
1036 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
1039 // We already handled vector-to-vector and scalar-to-scalar conversions.
1041 // This is where we handle bitcasts between vectors and scalars. We need to assume
1042 // that the conversion is scalarized in one way or another.
1043 if (Opcode == Instruction::BitCast) {
1044 // Illegal bitcasts are done by storing and loading from a stack slot.
1045 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
1046 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
1049 llvm_unreachable("Unhandled cast");
1052 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1053 VectorType *VecTy, unsigned Index) {
1054 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1056 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1057 TTI::CastContextHint::None,
1058 TTI::TCK_RecipThroughput);
1061 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1062 const Instruction *I = nullptr) {
1063 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1066 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1067 CmpInst::Predicate VecPred,
1068 TTI::TargetCostKind CostKind,
1069 const Instruction *I = nullptr) {
1070 const TargetLoweringBase *TLI = getTLI();
1071 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1072 assert(ISD && "Invalid opcode");
1074 // TODO: Handle other cost kinds.
1075 if (CostKind != TTI::TCK_RecipThroughput)
1076 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1079 // Selects on vectors are actually vector selects.
1080 if (ISD == ISD::SELECT) {
1081 assert(CondTy && "CondTy must exist");
1082 if (CondTy->isVectorTy())
1085 std::pair<InstructionCost, MVT> LT =
1086 TLI->getTypeLegalizationCost(DL, ValTy);
1088 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1089 !TLI->isOperationExpand(ISD, LT.second)) {
1090 // The operation is legal. Assume it costs 1. Multiply
1091 // by the type-legalization overhead.
1092 return LT.first * 1;
1095 // Otherwise, assume that the cmp/select is scalarized.
1096 // TODO: If one of the types get legalized by splitting, handle this
1097 // similarly to what getCastInstrCost() does.
1098 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1099 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1101 CondTy = CondTy->getScalarType();
1102 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1103 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1105 // Return the cost of multiple scalar invocations plus the cost of
1106 // inserting and extracting the values.
1107 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1110 // Unknown scalar opcode.
1114 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1116 std::pair<InstructionCost, MVT> LT =
1117 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1122 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1124 const APInt &DemandedDstElts,
1125 TTI::TargetCostKind CostKind) {
1126 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1127 "Unexpected size of DemandedDstElts.");
1129 InstructionCost Cost;
1131 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1132 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1134 // The Mask shuffling cost is the cost of extracting all the elements of the
1135 // Mask and inserting each of them Factor times into the wide vector:
1137 // E.g. an interleaved group with factor 3:
1138 // %mask = icmp ult <8 x i32> %vec1, %vec2
1139 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1140 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1141 // The cost is estimated as extracting all mask elements from the <8 x i1> mask
1142 // vector and inserting them Factor times into the <24 x i1> shuffled mask vector.
1144 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1145 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1149 thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1150 /*Insert*/ true, /*Extract*/ false);
1155 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1156 MaybeAlign Alignment, unsigned AddressSpace,
1157 TTI::TargetCostKind CostKind,
1158 const Instruction *I = nullptr) {
1159 assert(!Src->isVoidTy() && "Invalid type");
1160 // Assume types, such as structs, are expensive.
1161 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1163 std::pair<InstructionCost, MVT> LT =
1164 getTLI()->getTypeLegalizationCost(DL, Src);
1166 // Assuming that all loads of legal types cost 1.
1167 InstructionCost Cost = LT.first;
1168 if (CostKind != TTI::TCK_RecipThroughput)
1171 if (Src->isVectorTy() &&
1172 // In practice it's not currently possible to have a change in lane
1173 // length for extending loads or truncating stores so both types should
1174 // have the same scalable property.
1175 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1176 LT.second.getSizeInBits())) {
1177 // This is a vector load that legalizes to a larger type than the vector
1178 // itself. Unless the corresponding extending load or truncating store is
1179 // legal, then this will scalarize.
1180 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1181 EVT MemVT = getTLI()->getValueType(DL, Src);
1182 if (Opcode == Instruction::Store)
1183 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1185 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1187 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1188 // This is a vector load/store for some illegal type that is scalarized.
1189 // We must account for the cost of building or decomposing the vector.
1190 Cost += getScalarizationOverhead(cast<VectorType>(Src),
1191 Opcode != Instruction::Store,
1192 Opcode == Instruction::Store);
1199 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1200 Align Alignment, unsigned AddressSpace,
1201 TTI::TargetCostKind CostKind) {
1202 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1206 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1207 const Value *Ptr, bool VariableMask,
1209 TTI::TargetCostKind CostKind,
1210 const Instruction *I = nullptr) {
1211 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1215 InstructionCost getInterleavedMemoryOpCost(
1216 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1217 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1218 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1219 auto *VT = cast<FixedVectorType>(VecTy);
1221 unsigned NumElts = VT->getNumElements();
1222 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1224 unsigned NumSubElts = NumElts / Factor;
1225 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1227 // First, compute the cost of the load/store operation.
1228 InstructionCost Cost;
1229 if (UseMaskForCond || UseMaskForGaps)
1230 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1231 AddressSpace, CostKind);
1233 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1236 // Legalize the vector type, and get the legalized and unlegalized type sizes.
1238 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1239 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1240 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1242 // Scale the cost of the memory operation by the fraction of legalized
1243 // instructions that will actually be used. We shouldn't account for the
1244 // cost of dead instructions since they will be removed.
1246 // E.g., An interleaved load of factor 8:
1247 // %vec = load <16 x i64>, <16 x i64>* %ptr
1248 // %v0 = shufflevector %vec, undef, <0, 8>
1250 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1251 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1252 // type). The other loads are unused.
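// In that illustrative case only 2 of the 8 legal loads are used, so the
// memory cost computed above is scaled by 2/8 (rounded up).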
1254 // TODO: Note that legalization can turn masked loads/stores into unmasked
1255 // (legalized) loads/stores. This can be reflected in the cost.
1256 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1257 // The number of loads of a legal type it will take to represent a load
1258 // of the unlegalized vector type.
1259 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1261 // The number of elements of the unlegalized type that correspond to a
1262 // single legal instruction.
1263 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1265 // Determine which legal instructions will be used.
1266 BitVector UsedInsts(NumLegalInsts, false);
1267 for (unsigned Index : Indices)
1268 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1269 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1271 // Scale the cost of the load by the fraction of legal instructions that are used.
1273 Cost = divideCeil(UsedInsts.count() * Cost.getValue().getValue(),
1277 // Then add the cost of the interleave operation.
1278 assert(Indices.size() <= Factor &&
1279 "Interleaved memory op has too many members");
1281 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1282 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1284 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1285 for (unsigned Index : Indices) {
1286 assert(Index < Factor && "Invalid index for interleaved memory op");
1287 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1288 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1291 if (Opcode == Instruction::Load) {
1292 // The interleave cost is similar to extracting the sub-vectors' elements
1293 // from the wide vector and inserting them into the sub-vectors.
1295 // E.g. An interleaved load of factor 2 (with one member of index 0):
1296 // %vec = load <8 x i32>, <8 x i32>* %ptr
1297 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1298 // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
1299 // <8 x i32> vector and inserting them into a <4 x i32> vector.
1300 InstructionCost InsSubCost =
1301 thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
1302 /*Insert*/ true, /*Extract*/ false);
1303 Cost += Indices.size() * InsSubCost;
1305 thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1306 /*Insert*/ false, /*Extract*/ true);
1308 // The interleave cost is the cost of extracting elements from the
1309 // sub-vectors and inserting them into the wide vector.
1311 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1313 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1314 // %gaps.mask = <true, true, false, true, true, false,
1315 // true, true, false, true, true, false>
1316 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1317 // i32 Align, <12 x i1> %gaps.mask
1318 // The cost is estimated as extracting all elements (of actual members,
1319 // excluding gaps) from both <4 x i32> vectors and inserting them into the <12 x i32> vector.
1321 InstructionCost ExtSubCost =
1322 thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
1323 /*Insert*/ false, /*Extract*/ true);
1324 Cost += ExtSubCost * Indices.size();
1325 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1330 if (!UseMaskForCond)
1333 Type *I8Type = Type::getInt8Ty(VT->getContext());
1335 Cost += thisT()->getReplicationShuffleCost(
1336 I8Type, Factor, NumSubElts,
1337 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1340 // The Gaps mask is invariant and created outside the loop; therefore the
1341 // cost of creating it is not accounted for here. However, if we have both
1342 // a MaskForGaps and some other mask that guards the execution of the
1343 // memory access, we need to account for the cost of And-ing the two masks here.
1345 if (UseMaskForGaps) {
1346 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1347 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1354 /// Get intrinsic cost based on arguments.
1355 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1356 TTI::TargetCostKind CostKind) {
1357 // Check for generically free intrinsics.
1358 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1361 // Assume that target intrinsics are cheap.
1362 Intrinsic::ID IID = ICA.getID();
1363 if (Function::isTargetIntrinsic(IID))
1364 return TargetTransformInfo::TCC_Basic;
1366 if (ICA.isTypeBasedOnly())
1367 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1369 Type *RetTy = ICA.getReturnType();
1371 ElementCount RetVF =
1372 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1373 : ElementCount::getFixed(1));
1374 const IntrinsicInst *I = ICA.getInst();
1375 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1376 FastMathFlags FMF = ICA.getFlags();
1381 case Intrinsic::cttz:
1382 // FIXME: If necessary, this should go in target-specific overrides.
1383 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1384 return TargetTransformInfo::TCC_Basic;
1387 case Intrinsic::ctlz:
1388 // FIXME: If necessary, this should go in target-specific overrides.
1389 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1390 return TargetTransformInfo::TCC_Basic;
1393 case Intrinsic::memcpy:
1394 return thisT()->getMemcpyCost(ICA.getInst());
1396 case Intrinsic::masked_scatter: {
1397 const Value *Mask = Args[3];
1398 bool VarMask = !isa<Constant>(Mask);
1399 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1400 return thisT()->getGatherScatterOpCost(Instruction::Store,
1401 ICA.getArgTypes()[0], Args[1],
1402 VarMask, Alignment, CostKind, I);
1404 case Intrinsic::masked_gather: {
1405 const Value *Mask = Args[2];
1406 bool VarMask = !isa<Constant>(Mask);
1407 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1408 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1409 VarMask, Alignment, CostKind, I);
1411 case Intrinsic::experimental_stepvector: {
1412 if (isa<ScalableVectorType>(RetTy))
1413 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1414 // The cost of materialising a constant integer vector.
1415 return TargetTransformInfo::TCC_Basic;
1417 case Intrinsic::experimental_vector_extract: {
1418 // FIXME: Handle the case where a scalable vector is extracted from a scalable vector.
1420 if (isa<ScalableVectorType>(RetTy))
1421 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1422 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1423 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1424 cast<VectorType>(Args[0]->getType()), None,
1425 Index, cast<VectorType>(RetTy));
1427 case Intrinsic::experimental_vector_insert: {
1428 // FIXME: Handle the case where a scalable vector is inserted into a scalable vector.
1430 if (isa<ScalableVectorType>(Args[1]->getType()))
1431 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1432 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1433 return thisT()->getShuffleCost(
1434 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1435 Index, cast<VectorType>(Args[1]->getType()));
1437 case Intrinsic::experimental_vector_reverse: {
1438 return thisT()->getShuffleCost(TTI::SK_Reverse,
1439 cast<VectorType>(Args[0]->getType()), None,
1440 0, cast<VectorType>(RetTy));
1442 case Intrinsic::experimental_vector_splice: {
1443 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1444 return thisT()->getShuffleCost(TTI::SK_Splice,
1445 cast<VectorType>(Args[0]->getType()), None,
1446 Index, cast<VectorType>(RetTy));
1448 case Intrinsic::vector_reduce_add:
1449 case Intrinsic::vector_reduce_mul:
1450 case Intrinsic::vector_reduce_and:
1451 case Intrinsic::vector_reduce_or:
1452 case Intrinsic::vector_reduce_xor:
1453 case Intrinsic::vector_reduce_smax:
1454 case Intrinsic::vector_reduce_smin:
1455 case Intrinsic::vector_reduce_fmax:
1456 case Intrinsic::vector_reduce_fmin:
1457 case Intrinsic::vector_reduce_umax:
1458 case Intrinsic::vector_reduce_umin: {
1459 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1460 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1462 case Intrinsic::vector_reduce_fadd:
1463 case Intrinsic::vector_reduce_fmul: {
1464 IntrinsicCostAttributes Attrs(
1465 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1466 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1468 case Intrinsic::fshl:
1469 case Intrinsic::fshr: {
1470 if (isa<ScalableVectorType>(RetTy))
1471 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1472 const Value *X = Args[0];
1473 const Value *Y = Args[1];
1474 const Value *Z = Args[2];
1475 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1476 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1477 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1478 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1479 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1480 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1482 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1483 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
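// Roughly (illustrative), the expansion below is costed as Or + Sub + Shl +
// LShr, plus a URem when the shift amount is not a constant, plus an
// ICmp/Select pair when X != Y (i.e. this is not a rotate).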
1484 InstructionCost Cost = 0;
1486 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1488 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1489 Cost += thisT()->getArithmeticInstrCost(
1490 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1491 Cost += thisT()->getArithmeticInstrCost(
1492 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1493 // Non-constant shift amounts require a modulo.
1494 if (OpKindZ != TTI::OK_UniformConstantValue &&
1495 OpKindZ != TTI::OK_NonUniformConstantValue)
1496 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1497 CostKind, OpKindZ, OpKindBW,
1498 OpPropsZ, OpPropsBW);
1499 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1501 Type *CondTy = RetTy->getWithNewBitWidth(1);
1503 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1504 CmpInst::ICMP_EQ, CostKind);
1506 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1507 CmpInst::ICMP_EQ, CostKind);
1513 // Assume that we need to scalarize this intrinsic.
1514 // Compute the scalarization overhead based on Args for a vector intrinsic.
1516 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1517 if (RetVF.isVector() && !RetVF.isScalable()) {
1518 ScalarizationCost = 0;
1519 if (!RetTy->isVoidTy())
1520 ScalarizationCost +=
1521 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1522 ScalarizationCost +=
1523 getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1526 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1528 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1531 /// Get intrinsic cost based on argument types.
1532 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1533 /// cost of scalarizing the arguments and the return value will be computed based on types.
1536 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1537 TTI::TargetCostKind CostKind) {
1538 Intrinsic::ID IID = ICA.getID();
1539 Type *RetTy = ICA.getReturnType();
1540 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1541 FastMathFlags FMF = ICA.getFlags();
1542 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1543 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1545 VectorType *VecOpTy = nullptr;
1547 // The vector reduction operand is operand 0 except for fadd/fmul.
1548 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1549 unsigned VecTyIndex = 0;
1550 if (IID == Intrinsic::vector_reduce_fadd ||
1551 IID == Intrinsic::vector_reduce_fmul)
1553 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1554 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1557 // Library call cost - other than size, make it expensive.
1558 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1559 SmallVector<unsigned, 2> ISDs;
1562 // Scalable vectors cannot be scalarized, so return Invalid.
1563 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1564 return isa<ScalableVectorType>(Ty);
1566 return InstructionCost::getInvalid();
1568 // Assume that we need to scalarize this intrinsic.
1569 InstructionCost ScalarizationCost =
1570 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1571 unsigned ScalarCalls = 1;
1572 Type *ScalarRetTy = RetTy;
1573 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1574 if (!SkipScalarizationCost)
1575 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1576 ScalarCalls = std::max(ScalarCalls,
1577 cast<FixedVectorType>(RetVTy)->getNumElements());
1578 ScalarRetTy = RetTy->getScalarType();
1580 SmallVector<Type *, 4> ScalarTys;
1581 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1583 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1584 if (!SkipScalarizationCost)
1585 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1586 ScalarCalls = std::max(ScalarCalls,
1587 cast<FixedVectorType>(VTy)->getNumElements());
1588 Ty = Ty->getScalarType();
1590 ScalarTys.push_back(Ty);
1592 if (ScalarCalls == 1)
1593 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1595 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1596 InstructionCost ScalarCost =
1597 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1599 return ScalarCalls * ScalarCost + ScalarizationCost;
1601 // Look for intrinsics that can be lowered directly or turned into a scalar intrinsic call.
1603 case Intrinsic::sqrt:
1604 ISDs.push_back(ISD::FSQRT);
1606 case Intrinsic::sin:
1607 ISDs.push_back(ISD::FSIN);
1609 case Intrinsic::cos:
1610 ISDs.push_back(ISD::FCOS);
1612 case Intrinsic::exp:
1613 ISDs.push_back(ISD::FEXP);
1615 case Intrinsic::exp2:
1616 ISDs.push_back(ISD::FEXP2);
1618 case Intrinsic::log:
1619 ISDs.push_back(ISD::FLOG);
1621 case Intrinsic::log10:
1622 ISDs.push_back(ISD::FLOG10);
1624 case Intrinsic::log2:
1625 ISDs.push_back(ISD::FLOG2);
1627 case Intrinsic::fabs:
1628 ISDs.push_back(ISD::FABS);
1630 case Intrinsic::canonicalize:
1631 ISDs.push_back(ISD::FCANONICALIZE);
1633 case Intrinsic::minnum:
1634 ISDs.push_back(ISD::FMINNUM);
1636 case Intrinsic::maxnum:
1637 ISDs.push_back(ISD::FMAXNUM);
1639 case Intrinsic::minimum:
1640 ISDs.push_back(ISD::FMINIMUM);
1642 case Intrinsic::maximum:
1643 ISDs.push_back(ISD::FMAXIMUM);
1645 case Intrinsic::copysign:
1646 ISDs.push_back(ISD::FCOPYSIGN);
1648 case Intrinsic::floor:
1649 ISDs.push_back(ISD::FFLOOR);
1651 case Intrinsic::ceil:
1652 ISDs.push_back(ISD::FCEIL);
1654 case Intrinsic::trunc:
1655 ISDs.push_back(ISD::FTRUNC);
1657 case Intrinsic::nearbyint:
1658 ISDs.push_back(ISD::FNEARBYINT);
1660 case Intrinsic::rint:
1661 ISDs.push_back(ISD::FRINT);
1663 case Intrinsic::round:
1664 ISDs.push_back(ISD::FROUND);
1666 case Intrinsic::roundeven:
1667 ISDs.push_back(ISD::FROUNDEVEN);
1669 case Intrinsic::pow:
1670 ISDs.push_back(ISD::FPOW);
1672 case Intrinsic::fma:
1673 ISDs.push_back(ISD::FMA);
1675 case Intrinsic::fmuladd:
1676 ISDs.push_back(ISD::FMA);
1678 case Intrinsic::experimental_constrained_fmuladd:
1679 ISDs.push_back(ISD::STRICT_FMA);
1681 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1682 case Intrinsic::lifetime_start:
1683 case Intrinsic::lifetime_end:
1684 case Intrinsic::sideeffect:
1685 case Intrinsic::pseudoprobe:
1686 case Intrinsic::arithmetic_fence:
1688 case Intrinsic::masked_store: {
1690 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1691 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1694 case Intrinsic::masked_load: {
1696 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1697 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1700 case Intrinsic::vector_reduce_add:
1701 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1703 case Intrinsic::vector_reduce_mul:
1704 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1706 case Intrinsic::vector_reduce_and:
1707 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1709 case Intrinsic::vector_reduce_or:
1710 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
1712 case Intrinsic::vector_reduce_xor:
1713 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1715 case Intrinsic::vector_reduce_fadd:
1716 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1718 case Intrinsic::vector_reduce_fmul:
1719 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1721 case Intrinsic::vector_reduce_smax:
1722 case Intrinsic::vector_reduce_smin:
1723 case Intrinsic::vector_reduce_fmax:
1724 case Intrinsic::vector_reduce_fmin:
1725 return thisT()->getMinMaxReductionCost(
1726 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1727 /*IsUnsigned=*/false, CostKind);
1728 case Intrinsic::vector_reduce_umax:
1729 case Intrinsic::vector_reduce_umin:
1730 return thisT()->getMinMaxReductionCost(
1731 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1732 /*IsUnsigned=*/true, CostKind);
1733 case Intrinsic::abs: {
1734 // abs(X) = select(icmp(X,0),X,sub(0,X))
1735 Type *CondTy = RetTy->getWithNewBitWidth(1);
1736 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1737 InstructionCost Cost = 0;
1738 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1740 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1742 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1743 Cost += thisT()->getArithmeticInstrCost(
1744 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1747 case Intrinsic::smax:
1748 case Intrinsic::smin:
1749 case Intrinsic::umax:
1750 case Intrinsic::umin: {
1751 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1752 Type *CondTy = RetTy->getWithNewBitWidth(1);
1753 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
1754 CmpInst::Predicate Pred =
1755 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
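      // For example (illustrative only): umax(X, Y) is costed as
      //   %c = icmp ugt X, Y ; %r = select %c, X, Y
      // i.e. one compare plus one select.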
1756 InstructionCost Cost = 0;
1757 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1759 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1763 case Intrinsic::sadd_sat:
1764 case Intrinsic::ssub_sat: {
1765 Type *CondTy = RetTy->getWithNewBitWidth(1);
1767 Type *OpTy = StructType::create({RetTy, CondTy});
1768 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1769 ? Intrinsic::sadd_with_overflow
1770 : Intrinsic::ssub_with_overflow;
1771 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1773 // SatMax -> Overflow && SumDiff < 0
1774 // SatMin -> Overflow && SumDiff >= 0
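      // Rough sketch of the expansion being costed (illustrative, not what
      // every target emits):
      //   {SumDiff, Overflow} = llvm.sadd.with.overflow(X, Y)   (or ssub)
      //   Sat    = (SumDiff < 0) ? SatMax : SatMin
      //   Result = Overflow ? Sat : SumDiff
      // i.e. one overflow intrinsic, one signed compare and two selects,
      // which is what the costs accumulated below model.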
1775 InstructionCost Cost = 0;
1776 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1777 nullptr, ScalarizationCostPassed);
1778 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1779 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1781 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1782 CondTy, Pred, CostKind);
1785 case Intrinsic::uadd_sat:
1786 case Intrinsic::usub_sat: {
1787 Type *CondTy = RetTy->getWithNewBitWidth(1);
1789 Type *OpTy = StructType::create({RetTy, CondTy});
1790 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1791 ? Intrinsic::uadd_with_overflow
1792 : Intrinsic::usub_with_overflow;
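      // Illustrative sketch of the expansion being costed:
      //   {Sum, Overflow} = llvm.uadd.with.overflow(X, Y)   (or usub)
      //   Result = Overflow ? UINT_MAX : Sum                 (0 for usub.sat)
      // i.e. one overflow intrinsic plus a single select, matching the two
      // costs added below.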
1794 InstructionCost Cost = 0;
1795 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1796 nullptr, ScalarizationCostPassed);
1797 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1799 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1800 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1803 case Intrinsic::smul_fix:
1804 case Intrinsic::umul_fix: {
1805 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1806 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1809 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1810 TTI::CastContextHint CCH = TTI::CastContextHint::None;
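      // Rough sketch of the generic expansion being costed (not what every
      // target emits): extend both operands to twice the width, multiply in
      // the wide type, truncate out the low and high halves, then recombine
      // them with a constant lshr/shl pair and an or to apply the fixed-point
      // scale. Hence 2 casts + mul + 2 truncs + lshr + shl + or below.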
1812 InstructionCost Cost = 0;
1813 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1815 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1816 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1818 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1819 CostKind, TTI::OK_AnyValue,
1820 TTI::OK_UniformConstantValue);
1821 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1823 TTI::OK_UniformConstantValue);
1824 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1827 case Intrinsic::sadd_with_overflow:
1828 case Intrinsic::ssub_with_overflow: {
1829 Type *SumTy = RetTy->getContainedType(0);
1830 Type *OverflowTy = RetTy->getContainedType(1);
1831 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1832 ? BinaryOperator::Add
1833 : BinaryOperator::Sub;
1836 // Overflow -> (Result < LHS) ^ (RHS < 0)
1838 // Overflow -> (Result < LHS) ^ (RHS > 0)
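      // So the generic estimate below is one add/sub, two signed compares
      // producing overflow-typed values, and one xor combining them; targets
      // with native overflow flags will usually be cheaper.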
1839 InstructionCost Cost = 0;
1840 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1841 Cost += 2 * thisT()->getCmpSelInstrCost(
1842 Instruction::ICmp, SumTy, OverflowTy,
1843 CmpInst::ICMP_SGT, CostKind);
1844 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
1848 case Intrinsic::uadd_with_overflow:
1849 case Intrinsic::usub_with_overflow: {
1850 Type *SumTy = RetTy->getContainedType(0);
1851 Type *OverflowTy = RetTy->getContainedType(1);
1852 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1853 ? BinaryOperator::Add
1854 : BinaryOperator::Sub;
1855 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
1857 : CmpInst::ICMP_UGT;
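      // Illustrative sketch: uadd.with.overflow is costed as Sum = X + Y plus
      // a single unsigned compare of the result against an operand
      // (Sum < X implies overflow); usub.with.overflow uses the analogous
      // single unsigned compare.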
1859 InstructionCost Cost = 0;
1860 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1862 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1866 case Intrinsic::smul_with_overflow:
1867 case Intrinsic::umul_with_overflow: {
1868 Type *MulTy = RetTy->getContainedType(0);
1869 Type *OverflowTy = RetTy->getContainedType(1);
1870 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1871 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1872 bool IsSigned = IID == Intrinsic::smul_with_overflow;
1874 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
1875 TTI::CastContextHint CCH = TTI::CastContextHint::None;
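      // Rough sketch of the generic expansion being costed: extend both
      // operands to twice the width, multiply once in the wide type, truncate
      // out the low and high halves (constant lshr + truncs), and compare the
      // high half against the sign-extension of the low half (signed, extra
      // ashr) or against zero (unsigned) with an icmp ne to detect overflow.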
1877 InstructionCost Cost = 0;
1878 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1880 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1881 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1883 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
1884 CostKind, TTI::OK_AnyValue,
1885 TTI::OK_UniformConstantValue);
1888 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1889 CostKind, TTI::OK_AnyValue,
1890 TTI::OK_UniformConstantValue);
1892 Cost += thisT()->getCmpSelInstrCost(
1893 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
1896 case Intrinsic::ctpop:
1897 ISDs.push_back(ISD::CTPOP);
1898 // In case of legalization use TCC_Expensive. This is cheaper than a
1899 // library call but still not a cheap instruction.
1900 SingleCallCost = TargetTransformInfo::TCC_Expensive;
1902 case Intrinsic::ctlz:
1903 ISDs.push_back(ISD::CTLZ);
1905 case Intrinsic::cttz:
1906 ISDs.push_back(ISD::CTTZ);
1908 case Intrinsic::bswap:
1909 ISDs.push_back(ISD::BSWAP);
1911 case Intrinsic::bitreverse:
1912 ISDs.push_back(ISD::BITREVERSE);
1916 const TargetLoweringBase *TLI = getTLI();
1917 std::pair<InstructionCost, MVT> LT =
1918 TLI->getTypeLegalizationCost(DL, RetTy);
1920 SmallVector<InstructionCost, 2> LegalCost;
1921 SmallVector<InstructionCost, 2> CustomCost;
1922 for (unsigned ISD : ISDs) {
1923 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1924 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1925 TLI->isFAbsFree(LT.second)) {
1929 // The operation is legal. Assume it costs 1.
1930 // If the type is split to multiple registers, assume that there is some
1931 // overhead to this.
1932 // TODO: Once we have extract/insert subvector cost we need to use them.
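        // Hypothetical illustration: if the legalization cost LT.first is 4
        // and the legalized type splits across registers, the legal-cost
        // estimate pushed below is 4 * 2 = 8; for a type that does not split
        // it would be LT.first * 1 = 4.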
1934 LegalCost.push_back(LT.first * 2);
1936 LegalCost.push_back(LT.first * 1);
1937 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1938 // If the operation is custom lowered then assume
1939 // that the code is twice as expensive.
1940 CustomCost.push_back(LT.first * 2);
1944 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1945 if (MinLegalCostI != LegalCost.end())
1946 return *MinLegalCostI;
1948 auto MinCustomCostI =
1949 std::min_element(CustomCost.begin(), CustomCost.end());
1950 if (MinCustomCostI != CustomCost.end())
1951 return *MinCustomCostI;
1953     // If we can't lower fmuladd into an FMA, estimate the cost as a floating-
1954     // point mul followed by an add.
1955 if (IID == Intrinsic::fmuladd)
1956 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1958 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1960 if (IID == Intrinsic::experimental_constrained_fmuladd) {
1961 IntrinsicCostAttributes FMulAttrs(
1962 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1963 IntrinsicCostAttributes FAddAttrs(
1964 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1965 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1966 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1969 // Else, assume that we need to scalarize this intrinsic. For math builtins
1970     // this will emit a costly libcall, adding call overhead and spills. Make it expensive.
1972 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1973 // Scalable vectors cannot be scalarized, so return Invalid.
1974 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1975 return isa<ScalableVectorType>(Ty);
1977 return InstructionCost::getInvalid();
1979 InstructionCost ScalarizationCost =
1980 SkipScalarizationCost ? ScalarizationCostPassed
1981                                 : getScalarizationOverhead(RetVTy, /*Insert=*/true, /*Extract=*/false);
1983 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1984 SmallVector<Type *, 4> ScalarTys;
1985 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1987 if (Ty->isVectorTy())
1988 Ty = Ty->getScalarType();
1989 ScalarTys.push_back(Ty);
1991 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1992 InstructionCost ScalarCost =
1993 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1994 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1995 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1996 if (!ICA.skipScalarizationCost())
1997             ScalarizationCost += getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
1998 ScalarCalls = std::max(ScalarCalls,
1999 cast<FixedVectorType>(VTy)->getNumElements());
2002 return ScalarCalls * ScalarCost + ScalarizationCost;
2005 // This is going to be turned into a library call, make it expensive.
2006 return SingleCallCost;
2009 /// Compute a cost of the given call instruction.
2011 /// Compute the cost of calling function F with return type RetTy and
2012   /// argument types Tys. F might be nullptr; in this case the cost of an
2013   /// arbitrary call with the specified signature will be returned.
2014   /// This is used, for instance, when we estimate a call to a vector
2015 /// counterpart of the given function.
2016 /// \param F Called function, might be nullptr.
2017 /// \param RetTy Return value types.
2018 /// \param Tys Argument types.
2019 /// \returns The cost of Call instruction.
2020 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2021 ArrayRef<Type *> Tys,
2022 TTI::TargetCostKind CostKind) {
2026 unsigned getNumberOfParts(Type *Tp) {
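    // Uses the type-legalization cost (LT.first) as the number of parts the
    // type splits into, returning 0 when that cost is invalid; e.g. a fixed
    // vector twice the widest legal vector width would typically report 2.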
2027 std::pair<InstructionCost, MVT> LT =
2028 getTLI()->getTypeLegalizationCost(DL, Tp);
2029 return LT.first.isValid() ? *LT.first.getValue() : 0;
2032 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2037 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2038   /// We're assuming that the reduction operation is performed in the following way:
2040 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2041   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n-1, i32 undef, ..., i32 undef>
2042 /// \----------------v-------------/ \----------v------------/
2043 /// n/2 elements n/2 elements
2044   /// %red1 = op <n x t> %val, <n x t> %val1
2045 /// After this operation we have a vector %red1 where only the first n/2
2046   /// elements are meaningful; the second n/2 elements are undefined and can be
2047 /// dropped. All other operations are actually working with the vector of
2048 /// length n/2, not n, though the real vector length is still n.
2049 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2050   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2-1, i32 undef, ..., i32 undef>
2051 /// \----------------v-------------/ \----------v------------/
2052 /// n/4 elements 3*n/4 elements
2053   /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
2054 /// length n/2, the resulting vector has length n/4 etc.
2056 /// The cost model should take into account that the actual length of the
2057 /// vector is reduced on each iteration.
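  ///
  /// As a rough worked example (ignoring legalization): reducing an
  /// <8 x float> with fadd takes log2(8) = 3 levels, so the estimate is
  /// roughly 3 shuffles + 3 vector fadds + 1 extractelement for the final
  /// scalar result; levels wider than the legal vector width are instead
  /// costed on progressively halved subvectors.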
2058 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2059 TTI::TargetCostKind CostKind) {
2060 Type *ScalarTy = Ty->getElementType();
2061 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2062 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2063 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2065 // Or reduction for i1 is represented as:
2066 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2067 // %res = cmp ne iReduxWidth %val, 0
2068 // And reduction for i1 is represented as:
2069 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2070       // %res = cmp eq iReduxWidth %val, -1   ; i.e. all bits set
2071 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2072 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2073 TTI::CastContextHint::None, CostKind) +
2074 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2075 CmpInst::makeCmpResultType(ValTy),
2076 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2078 unsigned NumReduxLevels = Log2_32(NumVecElts);
2079 InstructionCost ArithCost = 0;
2080 InstructionCost ShuffleCost = 0;
2081 std::pair<InstructionCost, MVT> LT =
2082 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2083 unsigned LongVectorCount = 0;
2085 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2086 while (NumVecElts > MVTLen) {
2088 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2089 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2091 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2096 NumReduxLevels -= LongVectorCount;
2098 // The minimal length of the vector is limited by the real length of vector
2099 // operations performed on the current platform. That's why several final
2100 // reduction operations are performed on the vectors with the same
2101 // architecture-dependent length.
2103 // By default reductions need one shuffle per reduction level.
2104 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2105 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2107 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2108 return ShuffleCost + ArithCost +
2109 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2112 /// Try to calculate the cost of performing strict (in-order) reductions,
2113 /// which involves doing a sequence of floating point additions in lane
2114 /// order, starting with an initial value. For example, consider a scalar
2115 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2117 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2119 /// %add1 = %InitVal + %v0
2120 /// %add2 = %add1 + %v1
2121 /// %add3 = %add2 + %v2
2122 /// %add4 = %add3 + %v3
2124 /// As a simple estimate we can say the cost of such a reduction is 4 times
2125 /// the cost of a scalar FP addition. We can only estimate the costs for
2126 /// fixed-width vectors here because for scalable vectors we do not know the
2127 /// runtime number of operations.
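  ///
  /// As a rough sketch, for the <4 x float> example above the estimate is
  /// 4 lane extracts plus 4 scalar fadds, i.e. the per-lane extraction
  /// overhead plus NumElements times the scalar opcode cost.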
2128 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2129 TTI::TargetCostKind CostKind) {
2130 // Targets must implement a default value for the scalable case, since
2131 // we don't know how many lanes the vector has.
2132 if (isa<ScalableVectorType>(Ty))
2133 return InstructionCost::getInvalid();
2135 auto *VTy = cast<FixedVectorType>(Ty);
2136 InstructionCost ExtractCost =
2137 getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2138 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2139 Opcode, VTy->getElementType(), CostKind);
2140 ArithCost *= VTy->getNumElements();
2142 return ExtractCost + ArithCost;
2145 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2146 Optional<FastMathFlags> FMF,
2147 TTI::TargetCostKind CostKind) {
2148 if (TTI::requiresOrderedReduction(FMF))
2149 return getOrderedReductionCost(Opcode, Ty, CostKind);
2150 return getTreeReductionCost(Opcode, Ty, CostKind);
2153 /// Try to calculate op costs for min/max reduction operations.
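  /// Each reduction level is roughly costed as one shuffle plus a
  /// compare/select pair, with a final extractelement to obtain the scalar
  /// result.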
2154 /// \param CondTy Conditional type for the Select instruction.
2155 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2157 TTI::TargetCostKind CostKind) {
2158 Type *ScalarTy = Ty->getElementType();
2159 Type *ScalarCondTy = CondTy->getElementType();
2160 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2161 unsigned NumReduxLevels = Log2_32(NumVecElts);
2163 if (Ty->isFPOrFPVectorTy()) {
2164 CmpOpcode = Instruction::FCmp;
2166 assert(Ty->isIntOrIntVectorTy() &&
2167 "expecting floating point or integer type for min/max reduction");
2168 CmpOpcode = Instruction::ICmp;
2170 InstructionCost MinMaxCost = 0;
2171 InstructionCost ShuffleCost = 0;
2172 std::pair<InstructionCost, MVT> LT =
2173 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2174 unsigned LongVectorCount = 0;
2176 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2177 while (NumVecElts > MVTLen) {
2179 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2180 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2182 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2185 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2186 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2187 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2188 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2193 NumReduxLevels -= LongVectorCount;
2195 // The minimal length of the vector is limited by the real length of vector
2196 // operations performed on the current platform. That's why several final
2197     // reduction operations are performed on the vectors with the same
2198 // architecture-dependent length.
2199 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2200 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2203 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2204 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2205 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2206 CmpInst::BAD_ICMP_PREDICATE, CostKind));
2207 // The last min/max should be in vector registers and we counted it above.
2208 // So just need a single extractelement.
2209 return ShuffleCost + MinMaxCost +
2210 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2213 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2214 Type *ResTy, VectorType *Ty,
2215 TTI::TargetCostKind CostKind) {
2216 // Without any native support, this is equivalent to the cost of
2217 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
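    // Illustrative example (generic fallback, not a target-specific lowering):
    // an IsMLA reduction from <8 x i16> operands to an i32 result is costed as
    // the extension of the operands to <8 x i32>, one <8 x i32> multiply, and
    // an ordinary add reduction over <8 x i32>.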
2218 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2219 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2220 Instruction::Add, ExtTy, None, CostKind);
2221 InstructionCost MulCost = 0;
2222 InstructionCost ExtCost = thisT()->getCastInstrCost(
2223 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2224 TTI::CastContextHint::None, CostKind);
2227 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2231 return RedCost + MulCost + ExtCost;
2234 InstructionCost getVectorSplitCost() { return 1; }
2239 /// Concrete BasicTTIImpl that can be used if no further customization is needed.
2241 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2242 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2244 friend class BasicTTIImplBase<BasicTTIImpl>;
2246 const TargetSubtargetInfo *ST;
2247 const TargetLoweringBase *TLI;
2249 const TargetSubtargetInfo *getST() const { return ST; }
2250 const TargetLoweringBase *getTLI() const { return TLI; }
2253 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2256 } // end namespace llvm
2258 #endif // LLVM_CODEGEN_BASICTTIIMPL_H