1 //===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 /// This file implements a TargetTransformInfo analysis pass specific to the
9 /// Hexagon target machine. It uses the target's detailed information to provide
10 /// more precise answers to certain TTI queries, while letting the target
11 /// independent and default TTI implementations handle the rest.
13 //===----------------------------------------------------------------------===//
15 #include "HexagonTargetTransformInfo.h"
16 #include "HexagonSubtarget.h"
17 #include "llvm/Analysis/TargetTransformInfo.h"
18 #include "llvm/CodeGen/ValueTypes.h"
19 #include "llvm/IR/InstrTypes.h"
20 #include "llvm/IR/Instructions.h"
21 #include "llvm/IR/User.h"
22 #include "llvm/Support/Casting.h"
23 #include "llvm/Support/CommandLine.h"
24 #include "llvm/Transforms/Utils/UnrollLoop.h"
28 #define DEBUG_TYPE "hexagontti"
30 static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
31 cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
33 static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
34 cl::init(true), cl::Hidden,
35 cl::desc("Control lookup table emission on Hexagon target"));
37 // Constant "cost factor" to make floating point operations more expensive
38 // in terms of vectorization cost. This isn't the best way, but it should
39 // do. Ultimately, the cost should use cycles.
40 static const unsigned FloatFactor = 4;
42 bool HexagonTTIImpl::useHVX() const {
43 return ST.useHVXOps() && HexagonAutoHVX;
46 bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
47 assert(VecTy->isVectorTy());
48 if (isa<ScalableVectorType>(VecTy))
50 // Avoid types like <2 x i32*>.
51 if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
53 EVT VecVT = EVT::getEVT(VecTy);
54 if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64)
56 if (ST.isHVXVectorType(VecVT.getSimpleVT()))
58 auto Action = TLI.getPreferredVectorAction(VecVT.getSimpleVT());
59 return Action == TargetLoweringBase::TypeWidenVector;
62 unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
63 if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
64 return VTy->getNumElements();
65 assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
66 "Expecting scalar type");
70 TargetTransformInfo::PopcntSupportKind
71 HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
72 // Return fast hardware support as every input < 64 bits will be promoted
74 return TargetTransformInfo::PSK_FastHardware;
77 // The Hexagon target can unroll loops with run-time trip counts.
78 void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
79 TTI::UnrollingPreferences &UP) {
80 UP.Runtime = UP.Partial = true;
83 void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
84 TTI::PeelingPreferences &PP) {
85 BaseT::getPeelingPreferences(L, SE, PP);
86 // Only try to peel innermost loops with small runtime trip counts.
87 if (L && L->empty() && canPeel(L) &&
88 SE.getSmallConstantTripCount(L) == 0 &&
89 SE.getSmallConstantMaxTripCount(L) > 0 &&
90 SE.getSmallConstantMaxTripCount(L) <= 5) {
95 bool HexagonTTIImpl::shouldFavorPostInc() const {
/// --- Vector TTI begin ---
101 unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const {
103 return useHVX() ? 32 : 0;
107 unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) {
108 return useHVX() ? 2 : 0;
111 unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
112 return Vector ? getMinVectorRegisterBitWidth() : 32;
115 unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
116 return useHVX() ? ST.getVectorLength()*8 : 0;
119 unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
120 return (8 * ST.getVectorLength()) / ElemWidth;
123 unsigned HexagonTTIImpl::getScalarizationOverhead(VectorType *Ty,
124 const APInt &DemandedElts,
125 bool Insert, bool Extract) {
126 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
129 unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
130 ArrayRef<const Value*> Args, unsigned VF) {
131 return BaseT::getOperandsScalarizationOverhead(Args, VF);
134 unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
135 ArrayRef<Type*> Tys, TTI::TargetCostKind CostKind) {
136 return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind);
140 HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
141 TTI::TargetCostKind CostKind) {
142 if (ICA.getID() == Intrinsic::bswap) {
143 std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ICA.getReturnType());
146 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
149 unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
150 ScalarEvolution *SE, const SCEV *S) {
154 unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
155 MaybeAlign Alignment,
156 unsigned AddressSpace,
157 TTI::TargetCostKind CostKind,
158 const Instruction *I) {
159 assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
160 // TODO: Handle other cost kinds.
161 if (CostKind != TTI::TCK_RecipThroughput)
164 if (Opcode == Instruction::Store)
165 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
168 if (Src->isVectorTy()) {
169 VectorType *VecTy = cast<VectorType>(Src);
170 unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedSize();
171 if (useHVX() && isTypeForHVX(VecTy)) {
172 unsigned RegWidth = getRegisterBitWidth(true);
173 assert(RegWidth && "Non-zero vector register width expected");
174 // Cost of HVX loads.
175 if (VecWidth % RegWidth == 0)
176 return VecWidth / RegWidth;
177 // Cost of constructing HVX vector from scalar loads
178 const Align RegAlign(RegWidth / 8);
179 if (!Alignment || *Alignment > RegAlign)
180 Alignment = RegAlign;
182 unsigned AlignWidth = 8 * Alignment->value();
183 unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
188 // Add extra cost for floating point types.
190 VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1;
192 // At this point unspecified alignment is considered as Align(1).
193 const Align BoundAlignment = std::min(Alignment.valueOrOne(), Align(8));
194 unsigned AlignWidth = 8 * BoundAlignment.value();
195 unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
196 if (Alignment == Align(4) || Alignment == Align(8))
197 return Cost * NumLoads;
198 // Loads of less than 32 bits will need extra inserts to compose a vector.
199 assert(BoundAlignment <= Align(8));
200 unsigned LogA = Log2(BoundAlignment);
201 return (3 - LogA) * Cost * NumLoads;
204 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
208 unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
210 unsigned AddressSpace,
211 TTI::TargetCostKind CostKind) {
212 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
216 unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
217 int Index, Type *SubTp) {
221 unsigned HexagonTTIImpl::getGatherScatterOpCost(
222 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
223 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
224 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
225 Alignment, CostKind, I);
228 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(
229 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
230 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
231 bool UseMaskForCond, bool UseMaskForGaps) {
232 if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
233 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
234 Alignment, AddressSpace,
236 UseMaskForCond, UseMaskForGaps);
237 return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace,
241 unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
242 Type *CondTy, TTI::TargetCostKind CostKind, const Instruction *I) {
243 if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) {
244 std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
245 if (Opcode == Instruction::FCmp)
246 return LT.first + FloatFactor * getTypeNumElements(ValTy);
248 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
251 unsigned HexagonTTIImpl::getArithmeticInstrCost(
252 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
253 TTI::OperandValueKind Opd1Info,
254 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
255 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
256 const Instruction *CxtI) {
257 // TODO: Handle more cost kinds.
258 if (CostKind != TTI::TCK_RecipThroughput)
259 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
260 Opd2Info, Opd1PropInfo,
261 Opd2PropInfo, Args, CxtI);
263 if (Ty->isVectorTy()) {
264 std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty);
265 if (LT.second.isFloatingPoint())
266 return LT.first + FloatFactor * getTypeNumElements(Ty);
268 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
269 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
272 unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
273 Type *SrcTy, TTI::TargetCostKind CostKind, const Instruction *I) {
274 if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
275 unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
276 unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;
278 std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy);
279 std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy);
280 unsigned Cost = std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
281 // TODO: Allow non-throughput costs that aren't binary.
282 if (CostKind != TTI::TCK_RecipThroughput)
283 return Cost == 0 ? 0 : 1;
289 unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
291 Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
293 if (Opcode == Instruction::InsertElement) {
294 // Need two rotations for non-zero index.
295 unsigned Cost = (Index != 0) ? 2 : 0;
296 if (ElemTy->isIntegerTy(32))
298 // If it's not a 32-bit value, there will need to be an extract.
299 return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index);
302 if (Opcode == Instruction::ExtractElement)
/// --- Vector TTI end ---
310 unsigned HexagonTTIImpl::getPrefetchDistance() const {
311 return ST.getL1PrefetchDistance();
314 unsigned HexagonTTIImpl::getCacheLineSize() const {
315 return ST.getL1CacheLineSize();
319 HexagonTTIImpl::getUserCost(const User *U,
320 ArrayRef<const Value *> Operands,
321 TTI::TargetCostKind CostKind) {
322 auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
323 if (!CI->isIntegerCast())
325 // Only extensions from an integer type shorter than 32-bit to i32
326 // can be folded into the load.
327 const DataLayout &DL = getDataLayout();
328 unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy());
329 unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy());
330 if (DBW != 32 || SBW >= DBW)
333 const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
334 // Technically, this code could allow multiple uses of the load, and
335 // check if all the uses are the same extension operation, but this
336 // should be sufficient for most cases.
337 return LI && LI->hasOneUse();
340 if (const CastInst *CI = dyn_cast<const CastInst>(U))
341 if (isCastFoldedIntoLoad(CI))
342 return TargetTransformInfo::TCC_Free;
343 return BaseT::getUserCost(U, Operands, CostKind);
346 bool HexagonTTIImpl::shouldBuildLookupTables() const {
347 return EmitLookupTables;