//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to some
/// "generic" X86 CPU rather than a concrete CPU model. Usually the numbers
/// correspond to the CPU where the feature first appeared. For example, if we
/// use Subtarget.hasSSE42() in the lookups below, the cost is based on Nehalem
/// as that was the first CPU to support that feature level and thus most
/// likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   AVX     - Sandy Bridge
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target-dependent costs (latency):
///                     divss     sqrtss    rsqrtss
///   Piledriver        9-24      13-15     5
///   Pentium II,III    18        30        2
///   Nehalem           7-14      7-18      3
///   Haswell           10-13     11        5
/// TODO: Develop and implement the target-dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//
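
// Illustrative example of that convention: the generic SSE4.2 entry
//   { ISD::FDIV, MVT::v4f32, 14 } // Nehalem from http://www.agner.org/
// in SSE42CostTable below is what any CPU that merely reports hasSSE42()
// pays for a <4 x float> fdiv, regardless of how fast its own divider is.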

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
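
// For example, an AVX-512 subtarget built with -mprefer-vector-width=256
// reports getPreferVectorWidth() == 256, so RGK_FixedWidthVector resolves to
// TypeSize::getFixed(256) above even though 512-bit registers exist.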

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller handle it instead, which saves the overflow
  // check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // vXi8 multiplications are always promoted to vXi16.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
                                  Opd1PropInfo, Opd2PropInfo);
  }
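
  // E.g. 'mul <16 x i8>' is costed above as the sum of a zext to <16 x i16>,
  // the widened <16 x i16> multiply, and a trunc back to <16 x i8>.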

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

    // If both are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
  }

  // Vector multiply by pow2 will be simplified to shifts.
  if (ISD == ISD::MUL &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2)
    return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);
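
  // E.g. 'mul <4 x i32> %x, <i32 8, i32 8, i32 8, i32 8>' gets the cost of
  // the equivalent 'shl <4 x i32> %x, <i32 3, ...>'.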

  // On X86, vector signed division by a power-of-two constant is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                     Op2Info);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                     Op2Info);
    }

    return Cost;
  }
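
  // E.g. 'sdiv <4 x i32> %x, splat(4)' is therefore costed as
  // 2*ashr + lshr + add, and the matching srem adds a mul and a sub on top.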

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                    Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);
  }

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32,   18 }, // divss
    { ISD::FDIV, MVT::v4f32, 35 }, // divps
    { ISD::FDIV, MVT::f64,   33 }, // divsd
    { ISD::FDIV, MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,  MVT::v8i16,  2 }, // pmullw
    { ISD::FMUL, MVT::f64,    2 }, // mulsd
    { ISD::FMUL, MVT::v2f64,  4 }, // mulpd
    { ISD::FMUL, MVT::v4f32,  2 }, // mulps
    { ISD::FDIV, MVT::f32,   17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64,   32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64,  2 }, // addpd
    { ISD::FSUB, MVT::v2f64,  2 }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm pmuludq version throughput is 2 and addq throughput 4
    // thus: 3X2 (pmuludq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL,  MVT::v2i64, 17 },
    // slm addq/subq throughput is 4
    { ISD::ADD,  MVT::v2i64,  4 },
    { ISD::SUB,  MVT::v2i64,  4 },
  };

  if (ST->useSLMArithCosts()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      // TODO: Merge this into generic vXi32 MUL patterns above.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,  1 },
    { ISD::SRA,  MVT::v4i64,  1 },
    { ISD::SRA,  MVT::v8i64,  1 },

    { ISD::SHL,  MVT::v64i8,  4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,  4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,  8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8, 2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8, 2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA,  MVT::v4i64, 4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,    2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,    2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,    4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL,  MVT::v32i8,  4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,  4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,  8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL, MVT::v16i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8,   4 }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8,   6 }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8,   6 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8,   7 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8,  15 }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16,   1 }, // vpsllvw
    { ISD::SRL, MVT::v8i16,   1 }, // vpsrlvw
    { ISD::SRA, MVT::v8i16,   1 }, // vpsravw
    { ISD::SHL, MVT::v16i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v16i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v16i16,  1 }, // vpsravw
    { ISD::SHL, MVT::v32i16,  1 }, // vpsllvw
    { ISD::SRL, MVT::v32i16,  1 }, // vpsrlvw
    { ISD::SRA, MVT::v32i16,  1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
    { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.

    { ISD::SHL, MVT::v8i32,  1 }, // pslld
    { ISD::SRL, MVT::v8i32,  1 }, // psrld
    { ISD::SRA, MVT::v8i32,  1 }, // psrad
    { ISD::SHL, MVT::v4i64,  1 }, // psllq
    { ISD::SRL, MVT::v4i64,  1 }, // psrlq
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld.
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 2 }, // pmullq
    { ISD::MUL, MVT::v4i64, 2 }, // pmullq
    { ISD::MUL, MVT::v8i64, 2 }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,  MVT::v4i32,   1 },
    { ISD::SRL,  MVT::v4i32,   1 },
    { ISD::SRA,  MVT::v4i32,   1 },
    { ISD::SHL,  MVT::v8i32,   1 },
    { ISD::SRL,  MVT::v8i32,   1 },
    { ISD::SRA,  MVT::v8i32,   1 },
    { ISD::SHL,  MVT::v16i32,  1 },
    { ISD::SRL,  MVT::v16i32,  1 },
    { ISD::SRA,  MVT::v16i32,  1 },

    { ISD::SHL,  MVT::v2i64,   1 },
    { ISD::SRL,  MVT::v2i64,   1 },
    { ISD::SHL,  MVT::v4i64,   1 },
    { ISD::SRL,  MVT::v4i64,   1 },
    { ISD::SHL,  MVT::v8i64,   1 },
    { ISD::SRL,  MVT::v8i64,   1 },

    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },

    { ISD::MUL,  MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,  MVT::v8i64,   6 }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,  MVT::i64,     1 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f64,     4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,   4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,   8 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64,  16 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f32,     3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,   5 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2, even though we mark them as
    // custom in order to detect cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }
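
  // E.g. on AVX2, 'shl <16 x i16> %x, <constant build_vector>' is priced above
  // as a vpmullw multiply rather than through the shift tables.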

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8,    1 },
    { ISD::SRL, MVT::v16i8,    2 },
    { ISD::SRA, MVT::v16i8,    2 },
    { ISD::SHL, MVT::v8i16,    1 },
    { ISD::SRL, MVT::v8i16,    2 },
    { ISD::SRA, MVT::v8i16,    2 },
    { ISD::SHL, MVT::v4i32,    1 },
    { ISD::SRL, MVT::v4i32,    2 },
    { ISD::SRA, MVT::v4i32,    2 },
    { ISD::SHL, MVT::v2i64,    1 },
    { ISD::SRL, MVT::v2i64,    2 },
    { ISD::SRA, MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8,  2+2 },
    { ISD::SRL, MVT::v32i8,  4+2 },
    { ISD::SRA, MVT::v32i8,  4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32,  2+2 },
    { ISD::SRL, MVT::v8i32,  4+2 },
    { ISD::SRA, MVT::v8i32,  4+2 },
    { ISD::SHL, MVT::v4i64,  2+2 },
    { ISD::SRL, MVT::v4i64,  4+2 },
    { ISD::SRA, MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32,  2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64,  2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32,  2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64,  2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32,  2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64,    4 }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // Vector shift left by non-uniform constant can be lowered
    // into vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SHL,  MVT::v8i16,   5 }, // extend/vpsllvd/pack sequence.
    { ISD::SHL,  MVT::v16i16,  7 }, // extend/vpsllvd/pack sequence.
    { ISD::SHL,  MVT::v32i16, 14 }, // 2*extend/vpsllvd/pack sequence.

    { ISD::SRL,  MVT::v16i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v32i8,   6 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  12 }, // 2*vpblendvb sequence.
    { ISD::SRL,  MVT::v8i16,   5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v16i16,  7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL,  MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v16i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v32i8,  17 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  34 }, // 2*vpblendvb sequence.
    { ISD::SRA,  MVT::v8i16,   5 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v16i16,  7 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,   2 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,   2 }, // srl/xor/sub sequence.

    { ISD::SUB,  MVT::v32i8,   1 }, // psubb
    { ISD::ADD,  MVT::v32i8,   1 }, // paddb
    { ISD::SUB,  MVT::v16i16,  1 }, // psubw
    { ISD::ADD,  MVT::v16i16,  1 }, // paddw
    { ISD::SUB,  MVT::v8i32,   1 }, // psubd
    { ISD::ADD,  MVT::v8i32,   1 }, // paddd
    { ISD::SUB,  MVT::v4i64,   1 }, // psubq
    { ISD::ADD,  MVT::v4i64,   1 }, // paddq

    { ISD::MUL,  MVT::v16i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v8i32,   2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL,  MVT::v4i64,   6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::f64,     1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32,   1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32,     7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,  MVT::v16i16,  4 },
    { ISD::MUL,  MVT::v8i32,   5 }, // BTVER2 from http://www.agner.org/
    { ISD::MUL,  MVT::v4i64,  12 },

    { ISD::SUB,  MVT::v32i8,   4 },
    { ISD::ADD,  MVT::v32i8,   4 },
    { ISD::SUB,  MVT::v16i16,  4 },
    { ISD::ADD,  MVT::v16i16,  4 },
    { ISD::SUB,  MVT::v8i32,   4 },
    { ISD::ADD,  MVT::v8i32,   4 },
    { ISD::SUB,  MVT::v4i64,   4 },
    { ISD::ADD,  MVT::v4i64,   4 },

    { ISD::SHL,  MVT::v32i8,  22 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v8i16,   6 }, // pblendvb sequence.
    { ISD::SHL,  MVT::v16i16, 13 }, // pblendvb sequence + split.
    { ISD::SHL,  MVT::v4i32,   3 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL,  MVT::v8i32,   9 }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SHL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRL,  MVT::v32i8,  23 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRL,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRL,  MVT::v2i64,   2 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v4i64,   6 }, // Shift each lane + blend + split.

    { ISD::SRA,  MVT::v32i8,  44 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRA,  MVT::v4i32,   6 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  14 }, // Shift each lane + blend + split.
    { ISD::SRA,  MVT::v2i64,   5 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v4i64,  12 }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32,   2 }, // BTVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64,     2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,   2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64,   4 }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32,    14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32,  14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32,  28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64,    22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64,  22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64,  44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32,    1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64,  1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32,  1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32,   14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/

    { ISD::MUL,  MVT::v2i64,  6 }  // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
    { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
    { ISD::SHL, MVT::v4i32,  4 }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
    { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
    { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.

    { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
    { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.

    { ISD::MUL, MVT::v4i32,  2 }  // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL,  MVT::v16i8, 13 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16, 25 }, // cmpgtw sequence.
    { ISD::SHL,  MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
    { ISD::SHL,  MVT::v2i64,  4 }, // splat+shuffle sequence.

    { ISD::SRL,  MVT::v16i8, 14 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRL,  MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,  4 }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRA,  MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,  8 }, // srl/xor/sub splat+shuffle sequence.

    { ISD::MUL,  MVT::v8i16,  1 }, // pmullw
    { ISD::MUL,  MVT::v4i32,  6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL,  MVT::v2i64,  8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32,   23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64,   38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64,    1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64,  1 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32,    2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64,    2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64,    2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FNEG, MVT::f32,    2 }, // Pentium III from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32,    1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32,  2 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
    { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
    { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/
  };

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ADD, MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB, MVT::i8,  1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
  };

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular
  // registers. The overhead of division is going to dominate most kernels
  // anyway, so try hard to prevent vectorization of division - it is
  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
  // to hide "20 cycles" for each lane.
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    InstructionCost ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }
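
  // E.g. a 'udiv <4 x i32>' on an SSE2 target comes back as
  // 20 * 1 * 4 * ScalarCost, steep enough to keep the vectorizers away.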

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *BaseTp,
                                           ArrayRef<int> Mask, int Index,
                                           VectorType *SubTp,
                                           ArrayRef<const Value *> Args) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  Kind = improveShuffleKindFromMask(Kind, Mask);
  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        InstructionCost ExtractCost = getShuffleCost(
            TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);

        // If the original size is 32-bits or more, we can use pshufd. Otherwise
        // if we have SSSE3 we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }
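
  // E.g. extracting the upper <4 x i32> half of a legal <8 x i32> vector
  // (Index == 4) lands on a subvector boundary and is costed as SubLT.first;
  // an extract starting at Index == 0 is free.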

  // Subvector insertions are cheap if the subvectors are aligned.
  // Note that in general, the insertion starting at the beginning of a vector
  // isn't free, because we need to preserve the rest of the wide vector.
  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }

    // If the insertion isn't aligned, treat it like a 2-op shuffle.
    Kind = TTI::SK_PermuteTwoSrc;
  }

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      ST->hasSSE2()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
        {TTI::SK_Broadcast,        MVT::v4i16, 1}, // pshuflw
        {TTI::SK_Broadcast,        MVT::v2i16, 1}, // pshuflw
        {TTI::SK_Broadcast,        MVT::v8i8,  2}, // punpck/pshuflw
        {TTI::SK_Broadcast,        MVT::v4i8,  2}, // punpck/pshuflw
        {TTI::SK_Broadcast,        MVT::v2i8,  1}, // punpck

        {TTI::SK_Reverse,          MVT::v4i16, 1}, // pshuflw
        {TTI::SK_Reverse,          MVT::v2i16, 1}, // pshuflw
        {TTI::SK_Reverse,          MVT::v4i8,  3}, // punpck/pshuflw/packus
        {TTI::SK_Reverse,          MVT::v2i8,  1}, // punpck

        {TTI::SK_PermuteTwoSrc,    MVT::v4i16, 2}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc,    MVT::v2i16, 2}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc,    MVT::v8i8,  7}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc,    MVT::v4i8,  4}, // punpck/pshuflw
        {TTI::SK_PermuteTwoSrc,    MVT::v2i8,  2}, // punpck

        {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v8i8,  5}, // punpck/pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v4i8,  3}, // punpck/pshuflw
        {TTI::SK_PermuteSingleSrc, MVT::v2i8,  1}, // punpck
    };

    if (const auto *Entry =
            CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
      return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. Providing an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      InstructionCost NumOfDests = LT.first;

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      if (!Mask.empty() && NumOfDests.isValid()) {
        // Try to perform better estimation of the permutation.
        // 1. Split the source/destination vectors into real registers.
        // 2. Do the mask analysis to identify which real registers are
        //    permuted. If more than one source register is used to build a
        //    destination register, the cost for this destination register
        //    is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only
        //    one source register is used, build the mask and calculate the
        //    cost as a cost of PermuteSingleSrc.
        // Also, for the single register permute we try to identify if the
        // destination register is just a copy of the source register or the
        // copy of the previous destination register (the cost is
        // TTI::TCC_Basic). If the source register is just reused, the cost for
        // this operation is 0.
        unsigned E = *NumOfDests.getValue();
        unsigned NormalizedVF =
            LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
        unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
        unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
        SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem);
        copy(Mask, NormalizedMask.begin());
        unsigned PrevSrcReg = 0;
        ArrayRef<int> PrevRegMask;
        InstructionCost Cost = 0;
        processShuffleMasks(
            NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
            []() {},
            [this, SingleOpTy, &PrevSrcReg, &PrevRegMask,
             &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
              if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
                // Check if the previous register can be just copied to the
                // next one.
                if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
                    PrevRegMask != RegMask)
                  Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
                                         RegMask, 0, nullptr);
                else
                  // Just a copy of previous destination register.
                  Cost += TTI::TCC_Basic;
                return;
              }
              if (SrcReg != DestReg &&
                  any_of(RegMask, [](int I) { return I != UndefMaskElem; })) {
                // Just a copy of the source register.
                Cost += TTI::TCC_Basic;
              }
              PrevSrcReg = SrcReg;
              PrevRegMask = RegMask;
            },
            [this, SingleOpTy, &Cost](ArrayRef<int> RegMask,
                                      unsigned /*Unused*/,
                                      unsigned /*Unused*/) {
              Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
                                     0, nullptr);
            });
        return Cost;
      }

      InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                            None, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
    InstructionCost NumOfDests = LT.first;
    InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
    LT.first = NumOfDests * NumOfShufflesPerDest;
  }
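
  // E.g. if a two-input shuffle legalizes to LT.first == 2 registers, each of
  // the 2 destinations needs (2 * 2 - 1) == 3 shuffles, so the table cost
  // below is scaled by 6.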

  static const CostTblEntry AVX512VBMIShuffleTbl[] = {
      {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
      {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb

      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2}  // vpermt2b
  };

  if (ST->hasVBMI())
    if (const auto *Entry =
            CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
      {TTI::SK_Reverse, MVT::v64i8, 2},  // pshufb + vshufi64x2

      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8},  // extend to v32i16

      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2},  // vpermt2w
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1

      {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
      {TTI::SK_Select, MVT::v64i8, 1},  // vblendmb
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  static const CostTblEntry AVX512ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
      {TTI::SK_Broadcast, MVT::v8i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v64i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
      {TTI::SK_Reverse, MVT::v8i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
      {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
      {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca
      {TTI::SK_Reverse, MVT::v64i8, 7},  // per mca

      {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1},  // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1},  // vpermt2d
      {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // vpermt2pd
      {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1},  // vpermt2ps
      {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // vpermt2q
      {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1},  // vpermt2d

      // FIXME: This just applies the type legalization cost rules above
      // assuming these completely split.
      {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
      {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14},
      {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
      {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
      {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42},
      {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},

      {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
      {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq
      {TTI::SK_Select, MVT::v64i8, 1},  // vpternlogq
      {TTI::SK_Select, MVT::v8f64, 1},  // vblendmpd
      {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
      {TTI::SK_Select, MVT::v8i64, 1},  // vblendmq
      {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
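
  // AVX2 adds full cross-lane permutes for 32/64-bit elements; 8/16-bit
  // element shuffles still operate per 128-bit lane, so they are modeled as
  // vperm2i128 plus per-lane pshufb (and a blend for the two-source cases).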
  static const CostTblEntry AVX2ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64, 1},  // vbroadcastpd
      {TTI::SK_Broadcast, MVT::v8f32, 1},  // vbroadcastps
      {TTI::SK_Broadcast, MVT::v4i64, 1},  // vpbroadcastq
      {TTI::SK_Broadcast, MVT::v8i32, 1},  // vpbroadcastd
      {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
      {TTI::SK_Broadcast, MVT::v32i8, 1},  // vpbroadcastb

      {TTI::SK_Reverse, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_Reverse, MVT::v8f32, 1},  // vpermps
      {TTI::SK_Reverse, MVT::v4i64, 1},  // vpermq
      {TTI::SK_Reverse, MVT::v8i32, 1},  // vpermd
      {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
      {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
      {TTI::SK_Reverse, MVT::v32i8, 2},  // vperm2i128 + pshufb

      {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
      {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
      {TTI::SK_Select, MVT::v32i8, 1},  // vpblendvb

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1},  // vpermpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1},  // vpermps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1},  // vpermq
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1},  // vpermd
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vperm2i128 + 2*vpshufb
                                                  // + vpblendvb

      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},  // 2*vpermpd + vblendpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3},  // 2*vpermps + vblendps
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},  // 2*vpermq + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3},  // 2*vpermd + vpblendd
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
      {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7},  // 2*vperm2i128 + 4*vpshufb
                                               // + vpblendvb
  };

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
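
  // XOP's vpperm selects bytes from two 128-bit sources in a single
  // instruction, so the 128-bit two-source shuffles are one op; the 256-bit
  // entries model splitting into 128-bit halves.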
  static const CostTblEntry XOPShuffleTbl[] = {
      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vpermil2pd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2},  // vperm2f128 + vpermil2ps
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vpermil2pd
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2},  // vperm2f128 + vpermil2ps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
                                                  // + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4},  // vextractf128 + 2*vpperm
                                                  // + vinsertf128

      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1},  // vpperm
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9},  // 2*vextractf128 + 6*vpperm
                                               // + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1},  // vpperm
  };

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
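
  // AVX1 has no 256-bit integer shuffles, so everything below is built from
  // float-domain lane shuffles (vperm2f128/vpermilps) and 128-bit
  // extract/insert sequences.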
  static const CostTblEntry AVX1ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8f32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Broadcast, MVT::v8i32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
      {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
      {TTI::SK_Broadcast, MVT::v32i8, 2},  // vpshufb + vinsertf128

      {TTI::SK_Reverse, MVT::v4f64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Reverse, MVT::v8f32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v4i64, 2},  // vperm2f128 + vpermilpd
      {TTI::SK_Reverse, MVT::v8i32, 2},  // vperm2f128 + vpermilps
      {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
                                         // + vinsertf128
      {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
                                         // + vinsertf128
      {TTI::SK_Reverse, MVT::v32i8, 4},  // vextractf128 + 2*pshufb
                                         // + vinsertf128

      {TTI::SK_Select, MVT::v4i64, 1},  // vblendpd
      {TTI::SK_Select, MVT::v4f64, 1},  // vblendpd
      {TTI::SK_Select, MVT::v8i32, 1},  // vblendps
      {TTI::SK_Select, MVT::v8f32, 1},  // vblendps
      {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
      {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
      {TTI::SK_Select, MVT::v32i8, 3},  // vpand + vpandn + vpor

      {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2},  // vperm2f128 + vshufpd
      {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2},  // vperm2f128 + vshufpd
      {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4},  // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4},  // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128
      {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8},  // vextractf128 + 4*pshufb
                                                  // + 2*por + vinsertf128

      {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3},   // 2*vperm2f128 + vshufpd
      {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3},   // 2*vperm2f128 + vshufpd
      {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4},   // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4},   // 2*vperm2f128 + 2*vshufps
      {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
      {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15},  // 2*vextractf128 + 8*pshufb
                                                // + 4*por + vinsertf128
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
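
  // SSE41 only improves SK_Select: single-instruction blends
  // (pblendw/blendps/pblendvb) replace the 3-op pand+pandn+por sequences.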
  static const CostTblEntry SSE41ShuffleTbl[] = {
      {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
      {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
      {TTI::SK_Select, MVT::v4f32, 1}, // blendps
      {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
      {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
      {TTI::SK_Select, MVT::v16i8, 1}, // pblendvb
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
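
  // SSSE3's pshufb gives arbitrary single-source byte/word shuffles in one
  // instruction; the two-source cases merge two pshufb results with a por.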
  static const CostTblEntry SSSE3ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
      {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
      {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb

      {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
      {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
      {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb

      {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
      {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
      {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por

      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb

      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
      {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
  };

  if (ST->hasSSSE3())
    if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
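
  // Plain SSE2 has no byte-granular shuffle instruction, so the 8/16-bit
  // element cases are assembled from pshuflw/pshufhw/pshufd plus unpack/pack
  // sequences.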
  static const CostTblEntry SSE2ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
      {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
      {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
      {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
      {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
      {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd

      {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
      {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
      {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
      {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
      {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
      {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
                                        // + 2*pshufd + 2*unpck + packus

      {TTI::SK_Select, MVT::v2i64, 1}, // movsd
      {TTI::SK_Select, MVT::v2f64, 1}, // movsd
      {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
      {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
      {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
      {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por

      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1},  // shufpd
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1},  // pshufd
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1},  // pshufd
      {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5},  // 2*pshuflw + 2*pshufhw
      {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5},  // 2*pshuflw + 2*pshufhw
      {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw
                                                  // + 2*pshufd + 2*unpck + 2*packus

      {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1},  // shufpd
      {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1},  // shufpd
      {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2},  // 2*{unpck,movsd,pshufd}
      {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8},  // blend+permute
      {TTI::SK_PermuteTwoSrc, MVT::v8f16, 8},  // blend+permute
      {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute
  };
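
  // With SSE3, a v2f64 broadcast whose operand is a load folds into a single
  // movddup load, so the shuffle itself is modeled as free (cost 0).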
  static const CostTblEntry SSE3BroadcastLoadTbl[] = {
      {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
  };

  if (ST->hasSSE2()) {
    bool IsLoad =
        llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
    if (ST->hasSSE3() && IsLoad)
      if (const auto *Entry =
              CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
        assert(isLegalBroadcastLoad(BaseTp->getElementType(),
                                    LT.second.getVectorElementCount()) &&
               "Table entry missing from isLegalBroadcastLoad()");
        return LT.first * Entry->Cost;
      }

    if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }
  static const CostTblEntry SSE1ShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4f32, 1},        // shufps
      {TTI::SK_Reverse, MVT::v4f32, 1},          // shufps
      {TTI::SK_Select, MVT::v4f32, 2},           // 2*shufps
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // shufps
      {TTI::SK_PermuteTwoSrc, MVT::v4f32, 2},    // 2*shufps
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;

  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
}
InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  // The cost tables include both specific, custom (non-legal) src/dst type
  // conversions and generic, legalized types. We test for customs first,
  // before falling back to legalization.
  // FIXME: Need a better design of the cost table to handle non-simple types
  // of potential massive combinations (elem_num x src_type x dst_type).
  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },

    // Mask sign extend has an instruction.
    { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },

    // Mask zero extend is a sext + shift.
    { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },

    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
    { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
    { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
    { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
    { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },

    { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 },   // vpmovwb
    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },   // vpmovwb
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },   // vpmovwb
  };
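
  // AVX512DQ adds direct packed conversions between 64-bit integers and
  // floating-point (e.g. vcvtqq2pd/vcvttpd2qq), so the int64 <-> fp cases
  // below drop to a single instruction.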
  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
    // Mask sign extend has an instruction.
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },

    // Mask zero extend is a sext + shift.
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },

    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },

    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },

    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },

    { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
    { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },

    { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
    { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
  };

  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
  // 256-bit wide vectors.
  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
    { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
    { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
    { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },

    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 },    // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 },    // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 },    // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 },  // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 },   // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 },   // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 },   // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 },   // zmm vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },   // zmm vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },   // zmm vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },   // zmm vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },   // zmm vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },   // vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 },   // vpmovdb
    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 },   // vpmovdb
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
    { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
    { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
    { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 },   // vpmovqb
    { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 },  // vpshufb
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },   // vpmovqb
    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 },  // vpmovqb
    { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 },  // vpmovqb
    { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 },  // vpmovqb
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },  // vpmovqw
    { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
    { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },  // vpmovqd
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },  // zmm vpmovqd
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 }, // 2*vpmovqd+concat+vpmovdb

    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
    { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
    { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },

    // Sign extend is zmm vpternlogd+vptruncdb.
    // Zero extend is zmm broadcast load+vptruncdw.
    { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },

    // Sign extend is zmm vpternlogd+vptruncdw.
    // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
    { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

    { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
    { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq

    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },   // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },   // vpternlogq+psrlq

    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },

    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right

    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },

    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },

    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, 15 },
    { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, 11 },
    { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, 31 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
    { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
    { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, 15 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
    { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },

    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
    { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
    { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
  };
  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
    // Mask sign extend has an instruction.
    { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },

    // Mask zero extend is a sext + shift.
    { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },

    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
    { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
    { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
    { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
    { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },

    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
  };
  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
    // Mask sign extend has an instruction.
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },

    // Mask zero extend is a sext + shift.
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },

    { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
    { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },

    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
  };
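
  // AVX512VL applies the 512-bit conversion patterns to 128/256-bit vectors;
  // mask extends/truncates without BWI still need the compare/ternlog
  // sequences described in the comments below.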
  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 },    // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 },    // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 },    // sext+vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 },  // split+2*v8i8
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 },   // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 },   // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 },   // sext+vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 },   // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },   // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },   // vpslld+vptestmd
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },   // vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },   // vpsllq+vptestmq
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },  // vpmovqd
    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },   // vpmovqb
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },  // vpmovqw
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },   // vpmovwb

    // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
    { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
    { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
    { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
    { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
    { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
    { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
    { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },

    // sign extend is vpcmpeq+maskedmove+vpmovdw
    // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
    { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
    { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },

    { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq

    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },

    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },

    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },

    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
  };
  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },

    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },

    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },

    { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
    { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },

    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },

    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },

    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },

    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
  };
  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },

    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },

    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 },  // and+extract+2*packusdw
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },

    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },

    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },

    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },

    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
    { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
    { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },

    { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
    { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
  };
  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },

    // These truncates end up widening elements.
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 },  // PMOVZXBQ
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 },  // PMOVZXBD

    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },

    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },

    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },

    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },

    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  };
  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
    // These are somewhat magic numbers justified by comparing the
    // output of llvm-mca for our various supported scheduler models
    // and basing it off the worst case scenario.
    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },

    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },

    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },

    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },

    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },

    // These truncates are really widening elements.
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 },  // PSHUFD
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },  // PUNPCKLWD+DQ
    { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 },   // PUNPCKLBW+WD+PSHUFD
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 },  // PUNPCKLWD
    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },   // PUNPCKLBW+WD
    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 },   // PUNPCKLBW

    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
    { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
    { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
    { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
  };
  // Attempt to map directly to (simple) MVT types to let us match custom
  // entries.
  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  // The function getSimpleVT only handles simple value types.
  if (SrcTy.isSimple() && DstTy.isSimple()) {
    MVT SimpleSrcTy = SrcTy.getSimpleVT();
    MVT SimpleDstTy = DstTy.getSimpleVT();

    if (ST->useAVX512Regs()) {
      if (ST->hasBWI())
        if (const auto *Entry = ConvertCostTableLookup(
                AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
          return AdjustCost(Entry->Cost);

      if (ST->hasDQI())
        if (const auto *Entry = ConvertCostTableLookup(
                AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
          return AdjustCost(Entry->Cost);

      if (ST->hasAVX512())
        if (const auto *Entry = ConvertCostTableLookup(
                AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
          return AdjustCost(Entry->Cost);
    }

    if (ST->hasBWI())
      if (const auto *Entry = ConvertCostTableLookup(
              AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
        return AdjustCost(Entry->Cost);

    if (ST->hasDQI())
      if (const auto *Entry = ConvertCostTableLookup(
              AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
        return AdjustCost(Entry->Cost);

    if (ST->hasAVX512())
      if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
                                                     SimpleDstTy, SimpleSrcTy))
        return AdjustCost(Entry->Cost);

    if (ST->hasAVX2()) {
      if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                                     SimpleDstTy, SimpleSrcTy))
        return AdjustCost(Entry->Cost);
    }

    if (ST->hasAVX()) {
      if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
                                                     SimpleDstTy, SimpleSrcTy))
        return AdjustCost(Entry->Cost);
    }

    if (ST->hasSSE41()) {
      if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                     SimpleDstTy, SimpleSrcTy))
        return AdjustCost(Entry->Cost);
    }

    if (ST->hasSSE2()) {
      if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                     SimpleDstTy, SimpleSrcTy))
        return AdjustCost(Entry->Cost);
    }
  }
  // Fall back to legalized types.
  std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
  std::pair<InstructionCost, MVT> LTDest =
      TLI->getTypeLegalizationCost(DL, Dst);

  // If we're truncating to the same legalized type, just assume it's free.
  if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
    return TTI::TCC_Free;
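
  // Repeat the table lookups on the legalized types. A split source or
  // destination needs one conversion per piece, so the matched cost is scaled
  // by the larger of the two split factors.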
  if (ST->useAVX512Regs()) {
    if (ST->hasBWI())
      if (const auto *Entry = ConvertCostTableLookup(
              AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
        return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);

    if (ST->hasDQI())
      if (const auto *Entry = ConvertCostTableLookup(
              AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
        return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);

    if (ST->hasAVX512())
      if (const auto *Entry = ConvertCostTableLookup(
              AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
        return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  }

  if (ST->hasBWI())
    if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);

  if (ST->hasDQI())
    if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);

  if (ST->hasAVX512())
    if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);

  if (ST->hasAVX2())
    if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);

  if (ST->hasAVX())
    if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);

  if (ST->hasSSE41())
    if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);

  if (ST->hasSSE2())
    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                   LTDest.second, LTSrc.second))
      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  // Fallback for i8/i16 sitofp/uitofp cases: extend the source to i32 first,
  // then convert. After a zero extend the value always fits in a signed i32,
  // so SIToFP covers the unsigned case as well.
  if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
      1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
    Type *ExtSrc = Src->getWithNewBitWidth(32);
    unsigned ExtOpc =
        (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;

    // For scalar loads the extend would be free.
    InstructionCost ExtCost = 0;
    if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
      ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);

    return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
                                      TTI::CastContextHint::None, CostKind);
  }

  // Fallback for i8/i16 fptosi/fptoui cases: perform the conversion to i32
  // first, then truncate the result.
  if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
      1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
    Type *TruncDst = Dst->getWithNewBitWidth(32);
    return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
           getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
                            TTI::CastContextHint::None, CostKind);
  }

  return AdjustCost(
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  InstructionCost ExtraCost = 0;
2641 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
2642 // Some vector comparison predicates cost extra instructions.
2643 // TODO: Should we invert this and assume worst case cmp costs
2644 // and reduce for particular predicates?
2645 if (MTy.isVector() &&
2646 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
2647 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
2648 ST->hasBWI())) {
2649 // Fallback to I if a specific predicate wasn't specified.
2650 CmpInst::Predicate Pred = VecPred;
2651 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
2652 Pred == CmpInst::BAD_FCMP_PREDICATE))
2653 Pred = cast<CmpInst>(I)->getPredicate();
2655 switch (Pred) {
2656 case CmpInst::Predicate::ICMP_NE:
2657 // xor(cmpeq(x,y),-1)
2658 ExtraCost = 1;
2659 break;
2660 case CmpInst::Predicate::ICMP_SGE:
2661 case CmpInst::Predicate::ICMP_SLE:
2662 // xor(cmpgt(x,y),-1)
2663 ExtraCost = 1;
2664 break;
2665 case CmpInst::Predicate::ICMP_ULT:
2666 case CmpInst::Predicate::ICMP_UGT:
2667 // cmpgt(xor(x,signbit),xor(y,signbit))
2668 // xor(cmpeq(pmaxu(x,y),x),-1)
2669 ExtraCost = 2;
2670 break;
2671 case CmpInst::Predicate::ICMP_ULE:
2672 case CmpInst::Predicate::ICMP_UGE:
2673 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
2674 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
2675 // cmpeq(psubus(x,y),0)
2676 // cmpeq(pminu(x,y),x)
2677 ExtraCost = 1;
2678 } else {
2679 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
2680 ExtraCost = 3;
2681 }
2682 break;
2683 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
2684 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
2685 // Assume worst case scenario and add the maximum extra cost.
2686 ExtraCost = 3;
2687 break;
2688 default:
2689 break;
2690 }
2691 }
2692 }
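// Illustrative example: an SSE2 unsigned v4i32 compare (ICMP_UGT) has no
// native instruction, so ExtraCost = 2 models the two sign-bit flips (pxor)
// feeding the signed pcmpgtd, on top of the base SETCC table cost below.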
2694 static const CostTblEntry SLMCostTbl[] = {
2695 // slm pcmpeq/pcmpgt throughput is 2
2696 { ISD::SETCC, MVT::v2i64, 2 },
2699 static const CostTblEntry AVX512BWCostTbl[] = {
2700 { ISD::SETCC, MVT::v32i16, 1 },
2701 { ISD::SETCC, MVT::v64i8, 1 },
2703 { ISD::SELECT, MVT::v32i16, 1 },
2704 { ISD::SELECT, MVT::v64i8, 1 },
2707 static const CostTblEntry AVX512CostTbl[] = {
2708 { ISD::SETCC, MVT::v8i64, 1 },
2709 { ISD::SETCC, MVT::v16i32, 1 },
2710 { ISD::SETCC, MVT::v8f64, 1 },
2711 { ISD::SETCC, MVT::v16f32, 1 },
2713 { ISD::SELECT, MVT::v8i64, 1 },
2714 { ISD::SELECT, MVT::v4i64, 1 },
2715 { ISD::SELECT, MVT::v2i64, 1 },
2716 { ISD::SELECT, MVT::v16i32, 1 },
2717 { ISD::SELECT, MVT::v8i32, 1 },
2718 { ISD::SELECT, MVT::v4i32, 1 },
2719 { ISD::SELECT, MVT::v8f64, 1 },
2720 { ISD::SELECT, MVT::v4f64, 1 },
2721 { ISD::SELECT, MVT::v2f64, 1 },
2722 { ISD::SELECT, MVT::f64, 1 },
2723 { ISD::SELECT, MVT::v16f32, 1 },
2724 { ISD::SELECT, MVT::v8f32 , 1 },
2725 { ISD::SELECT, MVT::v4f32, 1 },
2726 { ISD::SELECT, MVT::f32 , 1 },
2728 { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
2729 { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
2731 { ISD::SELECT, MVT::v32i16, 2 },
2732 { ISD::SELECT, MVT::v16i16, 1 },
2733 { ISD::SELECT, MVT::v8i16, 1 },
2734 { ISD::SELECT, MVT::v64i8, 2 },
2735 { ISD::SELECT, MVT::v32i8, 1 },
2736 { ISD::SELECT, MVT::v16i8, 1 },
2739 static const CostTblEntry AVX2CostTbl[] = {
2740 { ISD::SETCC, MVT::v4i64, 1 },
2741 { ISD::SETCC, MVT::v8i32, 1 },
2742 { ISD::SETCC, MVT::v16i16, 1 },
2743 { ISD::SETCC, MVT::v32i8, 1 },
2745 { ISD::SELECT, MVT::v4f64, 2 }, // vblendvpd
2746 { ISD::SELECT, MVT::v8f32, 2 }, // vblendvps
2747 { ISD::SELECT, MVT::v4i64, 2 }, // pblendvb
2748 { ISD::SELECT, MVT::v8i32, 2 }, // pblendvb
2749 { ISD::SELECT, MVT::v16i16, 2 }, // pblendvb
2750 { ISD::SELECT, MVT::v32i8, 2 }, // pblendvb
2753 static const CostTblEntry AVX1CostTbl[] = {
2754 { ISD::SETCC, MVT::v4f64, 1 },
2755 { ISD::SETCC, MVT::v8f32, 1 },
2756 // AVX1 does not support 8-wide integer compare.
2757 { ISD::SETCC, MVT::v4i64, 4 },
2758 { ISD::SETCC, MVT::v8i32, 4 },
2759 { ISD::SETCC, MVT::v16i16, 4 },
2760 { ISD::SETCC, MVT::v32i8, 4 },
2762 { ISD::SELECT, MVT::v4f64, 3 }, // vblendvpd
2763 { ISD::SELECT, MVT::v8f32, 3 }, // vblendvps
2764 { ISD::SELECT, MVT::v4i64, 3 }, // vblendvpd
2765 { ISD::SELECT, MVT::v8i32, 3 }, // vblendvps
2766 { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
2767 { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
2770 static const CostTblEntry SSE42CostTbl[] = {
2771 { ISD::SETCC, MVT::v2i64, 1 },
2774 static const CostTblEntry SSE41CostTbl[] = {
2775 { ISD::SETCC, MVT::v2f64, 1 },
2776 { ISD::SETCC, MVT::v4f32, 1 },
2778 { ISD::SELECT, MVT::v2f64, 2 }, // blendvpd
2779 { ISD::SELECT, MVT::f64, 2 }, // blendvpd
2780 { ISD::SELECT, MVT::v4f32, 2 }, // blendvps
2781 { ISD::SELECT, MVT::f32 , 2 }, // blendvps
2782 { ISD::SELECT, MVT::v2i64, 2 }, // pblendvb
2783 { ISD::SELECT, MVT::v4i32, 2 }, // pblendvb
2784 { ISD::SELECT, MVT::v8i16, 2 }, // pblendvb
2785 { ISD::SELECT, MVT::v16i8, 2 }, // pblendvb
2788 static const CostTblEntry SSE2CostTbl[] = {
2789 { ISD::SETCC, MVT::v2f64, 2 },
2790 { ISD::SETCC, MVT::f64, 1 },
2791 { ISD::SETCC, MVT::v2i64, 5 }, // pcmpeqd/pcmpgtd expansion
2792 { ISD::SETCC, MVT::v4i32, 1 },
2793 { ISD::SETCC, MVT::v8i16, 1 },
2794 { ISD::SETCC, MVT::v16i8, 1 },
2796 { ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd
2797 { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd
2798 { ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por
2799 { ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por
2800 { ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por
2801 { ISD::SELECT, MVT::v16i8, 2 }, // pand + pandn + por
2804 static const CostTblEntry SSE1CostTbl[] = {
2805 { ISD::SETCC, MVT::v4f32, 2 },
2806 { ISD::SETCC, MVT::f32, 1 },
2808 { ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps
2809 { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps
2812 if (ST->useSLMArithCosts())
2813 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
2814 return LT.first * (ExtraCost + Entry->Cost);
2816 if (ST->hasBWI())
2817 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
2818 return LT.first * (ExtraCost + Entry->Cost);
2820 if (ST->hasAVX512())
2821 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
2822 return LT.first * (ExtraCost + Entry->Cost);
2824 if (ST->hasAVX2())
2825 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
2826 return LT.first * (ExtraCost + Entry->Cost);
2828 if (ST->hasAVX())
2829 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
2830 return LT.first * (ExtraCost + Entry->Cost);
2832 if (ST->hasSSE42())
2833 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
2834 return LT.first * (ExtraCost + Entry->Cost);
2836 if (ST->hasSSE41())
2837 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
2838 return LT.first * (ExtraCost + Entry->Cost);
2840 if (ST->hasSSE2())
2841 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
2842 return LT.first * (ExtraCost + Entry->Cost);
2844 if (ST->hasSSE1())
2845 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
2846 return LT.first * (ExtraCost + Entry->Cost);
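// Note the common pattern above: a table hit is scaled by LT.first, so e.g.
// a v4i64 SETCC on SSE2 (split into two v2i64 ops, table cost 5) is costed
// as 2 * (ExtraCost + 5).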
2848 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2851 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
2853 InstructionCost
2854 X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2855 TTI::TargetCostKind CostKind) {
2857 // Costs should match the codegen from:
2858 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
2859 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
2860 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
2861 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
2862 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
2864 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
2865 // specialized in these tables yet.
2866 static const CostTblEntry AVX512BITALGCostTbl[] = {
2867 { ISD::CTPOP, MVT::v32i16, 1 },
2868 { ISD::CTPOP, MVT::v64i8, 1 },
2869 { ISD::CTPOP, MVT::v16i16, 1 },
2870 { ISD::CTPOP, MVT::v32i8, 1 },
2871 { ISD::CTPOP, MVT::v8i16, 1 },
2872 { ISD::CTPOP, MVT::v16i8, 1 },
2874 static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
2875 { ISD::CTPOP, MVT::v8i64, 1 },
2876 { ISD::CTPOP, MVT::v16i32, 1 },
2877 { ISD::CTPOP, MVT::v4i64, 1 },
2878 { ISD::CTPOP, MVT::v8i32, 1 },
2879 { ISD::CTPOP, MVT::v2i64, 1 },
2880 { ISD::CTPOP, MVT::v4i32, 1 },
2882 static const CostTblEntry AVX512CDCostTbl[] = {
2883 { ISD::CTLZ, MVT::v8i64, 1 },
2884 { ISD::CTLZ, MVT::v16i32, 1 },
2885 { ISD::CTLZ, MVT::v32i16, 8 },
2886 { ISD::CTLZ, MVT::v64i8, 20 },
2887 { ISD::CTLZ, MVT::v4i64, 1 },
2888 { ISD::CTLZ, MVT::v8i32, 1 },
2889 { ISD::CTLZ, MVT::v16i16, 4 },
2890 { ISD::CTLZ, MVT::v32i8, 10 },
2891 { ISD::CTLZ, MVT::v2i64, 1 },
2892 { ISD::CTLZ, MVT::v4i32, 1 },
2893 { ISD::CTLZ, MVT::v8i16, 4 },
2894 { ISD::CTLZ, MVT::v16i8, 4 },
2896 static const CostTblEntry AVX512BWCostTbl[] = {
2897 { ISD::ABS, MVT::v32i16, 1 },
2898 { ISD::ABS, MVT::v64i8, 1 },
2899 { ISD::BITREVERSE, MVT::v8i64, 3 },
2900 { ISD::BITREVERSE, MVT::v16i32, 3 },
2901 { ISD::BITREVERSE, MVT::v32i16, 3 },
2902 { ISD::BITREVERSE, MVT::v64i8, 2 },
2903 { ISD::BSWAP, MVT::v8i64, 1 },
2904 { ISD::BSWAP, MVT::v16i32, 1 },
2905 { ISD::BSWAP, MVT::v32i16, 1 },
2906 { ISD::CTLZ, MVT::v8i64, 23 },
2907 { ISD::CTLZ, MVT::v16i32, 22 },
2908 { ISD::CTLZ, MVT::v32i16, 18 },
2909 { ISD::CTLZ, MVT::v64i8, 17 },
2910 { ISD::CTPOP, MVT::v8i64, 7 },
2911 { ISD::CTPOP, MVT::v16i32, 11 },
2912 { ISD::CTPOP, MVT::v32i16, 9 },
2913 { ISD::CTPOP, MVT::v64i8, 6 },
2914 { ISD::CTTZ, MVT::v8i64, 10 },
2915 { ISD::CTTZ, MVT::v16i32, 14 },
2916 { ISD::CTTZ, MVT::v32i16, 12 },
2917 { ISD::CTTZ, MVT::v64i8, 9 },
2918 { ISD::SADDSAT, MVT::v32i16, 1 },
2919 { ISD::SADDSAT, MVT::v64i8, 1 },
2920 { ISD::SMAX, MVT::v32i16, 1 },
2921 { ISD::SMAX, MVT::v64i8, 1 },
2922 { ISD::SMIN, MVT::v32i16, 1 },
2923 { ISD::SMIN, MVT::v64i8, 1 },
2924 { ISD::SSUBSAT, MVT::v32i16, 1 },
2925 { ISD::SSUBSAT, MVT::v64i8, 1 },
2926 { ISD::UADDSAT, MVT::v32i16, 1 },
2927 { ISD::UADDSAT, MVT::v64i8, 1 },
2928 { ISD::UMAX, MVT::v32i16, 1 },
2929 { ISD::UMAX, MVT::v64i8, 1 },
2930 { ISD::UMIN, MVT::v32i16, 1 },
2931 { ISD::UMIN, MVT::v64i8, 1 },
2932 { ISD::USUBSAT, MVT::v32i16, 1 },
2933 { ISD::USUBSAT, MVT::v64i8, 1 },
2935 static const CostTblEntry AVX512CostTbl[] = {
2936 { ISD::ABS, MVT::v8i64, 1 },
2937 { ISD::ABS, MVT::v16i32, 1 },
2938 { ISD::ABS, MVT::v32i16, 2 },
2939 { ISD::ABS, MVT::v64i8, 2 },
2940 { ISD::ABS, MVT::v4i64, 1 },
2941 { ISD::ABS, MVT::v2i64, 1 },
2942 { ISD::BITREVERSE, MVT::v8i64, 36 },
2943 { ISD::BITREVERSE, MVT::v16i32, 24 },
2944 { ISD::BITREVERSE, MVT::v32i16, 10 },
2945 { ISD::BITREVERSE, MVT::v64i8, 10 },
2946 { ISD::BSWAP, MVT::v8i64, 4 },
2947 { ISD::BSWAP, MVT::v16i32, 4 },
2948 { ISD::BSWAP, MVT::v32i16, 4 },
2949 { ISD::CTLZ, MVT::v8i64, 29 },
2950 { ISD::CTLZ, MVT::v16i32, 35 },
2951 { ISD::CTLZ, MVT::v32i16, 28 },
2952 { ISD::CTLZ, MVT::v64i8, 18 },
2953 { ISD::CTPOP, MVT::v8i64, 16 },
2954 { ISD::CTPOP, MVT::v16i32, 24 },
2955 { ISD::CTPOP, MVT::v32i16, 18 },
2956 { ISD::CTPOP, MVT::v64i8, 12 },
2957 { ISD::CTTZ, MVT::v8i64, 20 },
2958 { ISD::CTTZ, MVT::v16i32, 28 },
2959 { ISD::CTTZ, MVT::v32i16, 24 },
2960 { ISD::CTTZ, MVT::v64i8, 18 },
2961 { ISD::SMAX, MVT::v8i64, 1 },
2962 { ISD::SMAX, MVT::v16i32, 1 },
2963 { ISD::SMAX, MVT::v32i16, 2 },
2964 { ISD::SMAX, MVT::v64i8, 2 },
2965 { ISD::SMAX, MVT::v4i64, 1 },
2966 { ISD::SMAX, MVT::v2i64, 1 },
2967 { ISD::SMIN, MVT::v8i64, 1 },
2968 { ISD::SMIN, MVT::v16i32, 1 },
2969 { ISD::SMIN, MVT::v32i16, 2 },
2970 { ISD::SMIN, MVT::v64i8, 2 },
2971 { ISD::SMIN, MVT::v4i64, 1 },
2972 { ISD::SMIN, MVT::v2i64, 1 },
2973 { ISD::UMAX, MVT::v8i64, 1 },
2974 { ISD::UMAX, MVT::v16i32, 1 },
2975 { ISD::UMAX, MVT::v32i16, 2 },
2976 { ISD::UMAX, MVT::v64i8, 2 },
2977 { ISD::UMAX, MVT::v4i64, 1 },
2978 { ISD::UMAX, MVT::v2i64, 1 },
2979 { ISD::UMIN, MVT::v8i64, 1 },
2980 { ISD::UMIN, MVT::v16i32, 1 },
2981 { ISD::UMIN, MVT::v32i16, 2 },
2982 { ISD::UMIN, MVT::v64i8, 2 },
2983 { ISD::UMIN, MVT::v4i64, 1 },
2984 { ISD::UMIN, MVT::v2i64, 1 },
2985 { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
2986 { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
2987 { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
2988 { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
2989 { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
2990 { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
2991 { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
2992 { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
2993 { ISD::SADDSAT, MVT::v32i16, 2 },
2994 { ISD::SADDSAT, MVT::v64i8, 2 },
2995 { ISD::SSUBSAT, MVT::v32i16, 2 },
2996 { ISD::SSUBSAT, MVT::v64i8, 2 },
2997 { ISD::UADDSAT, MVT::v32i16, 2 },
2998 { ISD::UADDSAT, MVT::v64i8, 2 },
2999 { ISD::USUBSAT, MVT::v32i16, 2 },
3000 { ISD::USUBSAT, MVT::v64i8, 2 },
3001 { ISD::FMAXNUM, MVT::f32, 2 },
3002 { ISD::FMAXNUM, MVT::v4f32, 2 },
3003 { ISD::FMAXNUM, MVT::v8f32, 2 },
3004 { ISD::FMAXNUM, MVT::v16f32, 2 },
3005 { ISD::FMAXNUM, MVT::f64, 2 },
3006 { ISD::FMAXNUM, MVT::v2f64, 2 },
3007 { ISD::FMAXNUM, MVT::v4f64, 2 },
3008 { ISD::FMAXNUM, MVT::v8f64, 2 },
3010 static const CostTblEntry XOPCostTbl[] = {
3011 { ISD::BITREVERSE, MVT::v4i64, 4 },
3012 { ISD::BITREVERSE, MVT::v8i32, 4 },
3013 { ISD::BITREVERSE, MVT::v16i16, 4 },
3014 { ISD::BITREVERSE, MVT::v32i8, 4 },
3015 { ISD::BITREVERSE, MVT::v2i64, 1 },
3016 { ISD::BITREVERSE, MVT::v4i32, 1 },
3017 { ISD::BITREVERSE, MVT::v8i16, 1 },
3018 { ISD::BITREVERSE, MVT::v16i8, 1 },
3019 { ISD::BITREVERSE, MVT::i64, 3 },
3020 { ISD::BITREVERSE, MVT::i32, 3 },
3021 { ISD::BITREVERSE, MVT::i16, 3 },
3022 { ISD::BITREVERSE, MVT::i8, 3 }
3024 static const CostTblEntry AVX2CostTbl[] = {
3025 { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3026 { ISD::ABS, MVT::v8i32, 1 },
3027 { ISD::ABS, MVT::v16i16, 1 },
3028 { ISD::ABS, MVT::v32i8, 1 },
3029 { ISD::BITREVERSE, MVT::v2i64, 3 },
3030 { ISD::BITREVERSE, MVT::v4i64, 3 },
3031 { ISD::BITREVERSE, MVT::v4i32, 3 },
3032 { ISD::BITREVERSE, MVT::v8i32, 3 },
3033 { ISD::BITREVERSE, MVT::v8i16, 3 },
3034 { ISD::BITREVERSE, MVT::v16i16, 3 },
3035 { ISD::BITREVERSE, MVT::v16i8, 3 },
3036 { ISD::BITREVERSE, MVT::v32i8, 3 },
3037 { ISD::BSWAP, MVT::v4i64, 1 },
3038 { ISD::BSWAP, MVT::v8i32, 1 },
3039 { ISD::BSWAP, MVT::v16i16, 1 },
3040 { ISD::CTLZ, MVT::v2i64, 7 },
3041 { ISD::CTLZ, MVT::v4i64, 7 },
3042 { ISD::CTLZ, MVT::v4i32, 5 },
3043 { ISD::CTLZ, MVT::v8i32, 5 },
3044 { ISD::CTLZ, MVT::v8i16, 4 },
3045 { ISD::CTLZ, MVT::v16i16, 4 },
3046 { ISD::CTLZ, MVT::v16i8, 3 },
3047 { ISD::CTLZ, MVT::v32i8, 3 },
3048 { ISD::CTPOP, MVT::v2i64, 3 },
3049 { ISD::CTPOP, MVT::v4i64, 3 },
3050 { ISD::CTPOP, MVT::v4i32, 7 },
3051 { ISD::CTPOP, MVT::v8i32, 7 },
3052 { ISD::CTPOP, MVT::v8i16, 3 },
3053 { ISD::CTPOP, MVT::v16i16, 3 },
3054 { ISD::CTPOP, MVT::v16i8, 2 },
3055 { ISD::CTPOP, MVT::v32i8, 2 },
3056 { ISD::CTTZ, MVT::v2i64, 4 },
3057 { ISD::CTTZ, MVT::v4i64, 4 },
3058 { ISD::CTTZ, MVT::v4i32, 7 },
3059 { ISD::CTTZ, MVT::v8i32, 7 },
3060 { ISD::CTTZ, MVT::v8i16, 4 },
3061 { ISD::CTTZ, MVT::v16i16, 4 },
3062 { ISD::CTTZ, MVT::v16i8, 3 },
3063 { ISD::CTTZ, MVT::v32i8, 3 },
3064 { ISD::SADDSAT, MVT::v16i16, 1 },
3065 { ISD::SADDSAT, MVT::v32i8, 1 },
3066 { ISD::SMAX, MVT::v8i32, 1 },
3067 { ISD::SMAX, MVT::v16i16, 1 },
3068 { ISD::SMAX, MVT::v32i8, 1 },
3069 { ISD::SMIN, MVT::v8i32, 1 },
3070 { ISD::SMIN, MVT::v16i16, 1 },
3071 { ISD::SMIN, MVT::v32i8, 1 },
3072 { ISD::SSUBSAT, MVT::v16i16, 1 },
3073 { ISD::SSUBSAT, MVT::v32i8, 1 },
3074 { ISD::UADDSAT, MVT::v16i16, 1 },
3075 { ISD::UADDSAT, MVT::v32i8, 1 },
3076 { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
3077 { ISD::UMAX, MVT::v8i32, 1 },
3078 { ISD::UMAX, MVT::v16i16, 1 },
3079 { ISD::UMAX, MVT::v32i8, 1 },
3080 { ISD::UMIN, MVT::v8i32, 1 },
3081 { ISD::UMIN, MVT::v16i16, 1 },
3082 { ISD::UMIN, MVT::v32i8, 1 },
3083 { ISD::USUBSAT, MVT::v16i16, 1 },
3084 { ISD::USUBSAT, MVT::v32i8, 1 },
3085 { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
3086 { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
3087 { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
3088 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
3089 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
3090 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
3091 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
3092 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
3093 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
3095 static const CostTblEntry AVX1CostTbl[] = {
3096 { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3097 { ISD::ABS, MVT::v8i32, 3 },
3098 { ISD::ABS, MVT::v16i16, 3 },
3099 { ISD::ABS, MVT::v32i8, 3 },
3100 { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
3101 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
3102 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
3103 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
3104 { ISD::BSWAP, MVT::v4i64, 4 },
3105 { ISD::BSWAP, MVT::v8i32, 4 },
3106 { ISD::BSWAP, MVT::v16i16, 4 },
3107 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
3108 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
3109 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
3110 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
3111 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
3112 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
3113 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
3114 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
3115 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
3116 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
3117 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
3118 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
3119 { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3120 { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3121 { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3122 { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3123 { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3124 { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3125 { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3126 { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3127 { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3128 { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3129 { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3130 { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3131 { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
3132 { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3133 { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3134 { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3135 { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
3136 { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3137 { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3138 { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
3139 { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
3140 { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
3141 { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
3142 { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
3143 { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
3144 { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
3145 { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
3146 { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
3147 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
3148 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
3149 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
3150 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
3151 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
3152 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
3154 static const CostTblEntry GLMCostTbl[] = {
3155 { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
3156 { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
3157 { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
3158 { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
3160 static const CostTblEntry SLMCostTbl[] = {
3161 { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
3162 { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
3163 { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
3164 { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
3166 static const CostTblEntry SSE42CostTbl[] = {
3167 { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
3168 { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
3169 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
3170 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
3172 static const CostTblEntry SSE41CostTbl[] = {
3173 { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
3174 { ISD::SMAX, MVT::v4i32, 1 },
3175 { ISD::SMAX, MVT::v16i8, 1 },
3176 { ISD::SMIN, MVT::v4i32, 1 },
3177 { ISD::SMIN, MVT::v16i8, 1 },
3178 { ISD::UMAX, MVT::v4i32, 1 },
3179 { ISD::UMAX, MVT::v8i16, 1 },
3180 { ISD::UMIN, MVT::v4i32, 1 },
3181 { ISD::UMIN, MVT::v8i16, 1 },
3183 static const CostTblEntry SSSE3CostTbl[] = {
3184 { ISD::ABS, MVT::v4i32, 1 },
3185 { ISD::ABS, MVT::v8i16, 1 },
3186 { ISD::ABS, MVT::v16i8, 1 },
3187 { ISD::BITREVERSE, MVT::v2i64, 5 },
3188 { ISD::BITREVERSE, MVT::v4i32, 5 },
3189 { ISD::BITREVERSE, MVT::v8i16, 5 },
3190 { ISD::BITREVERSE, MVT::v16i8, 5 },
3191 { ISD::BSWAP, MVT::v2i64, 1 },
3192 { ISD::BSWAP, MVT::v4i32, 1 },
3193 { ISD::BSWAP, MVT::v8i16, 1 },
3194 { ISD::CTLZ, MVT::v2i64, 23 },
3195 { ISD::CTLZ, MVT::v4i32, 18 },
3196 { ISD::CTLZ, MVT::v8i16, 14 },
3197 { ISD::CTLZ, MVT::v16i8, 9 },
3198 { ISD::CTPOP, MVT::v2i64, 7 },
3199 { ISD::CTPOP, MVT::v4i32, 11 },
3200 { ISD::CTPOP, MVT::v8i16, 9 },
3201 { ISD::CTPOP, MVT::v16i8, 6 },
3202 { ISD::CTTZ, MVT::v2i64, 10 },
3203 { ISD::CTTZ, MVT::v4i32, 14 },
3204 { ISD::CTTZ, MVT::v8i16, 12 },
3205 { ISD::CTTZ, MVT::v16i8, 9 }
3207 static const CostTblEntry SSE2CostTbl[] = {
3208 { ISD::ABS, MVT::v2i64, 4 },
3209 { ISD::ABS, MVT::v4i32, 3 },
3210 { ISD::ABS, MVT::v8i16, 2 },
3211 { ISD::ABS, MVT::v16i8, 2 },
3212 { ISD::BITREVERSE, MVT::v2i64, 29 },
3213 { ISD::BITREVERSE, MVT::v4i32, 27 },
3214 { ISD::BITREVERSE, MVT::v8i16, 27 },
3215 { ISD::BITREVERSE, MVT::v16i8, 20 },
3216 { ISD::BSWAP, MVT::v2i64, 7 },
3217 { ISD::BSWAP, MVT::v4i32, 7 },
3218 { ISD::BSWAP, MVT::v8i16, 7 },
3219 { ISD::CTLZ, MVT::v2i64, 25 },
3220 { ISD::CTLZ, MVT::v4i32, 26 },
3221 { ISD::CTLZ, MVT::v8i16, 20 },
3222 { ISD::CTLZ, MVT::v16i8, 17 },
3223 { ISD::CTPOP, MVT::v2i64, 12 },
3224 { ISD::CTPOP, MVT::v4i32, 15 },
3225 { ISD::CTPOP, MVT::v8i16, 13 },
3226 { ISD::CTPOP, MVT::v16i8, 10 },
3227 { ISD::CTTZ, MVT::v2i64, 14 },
3228 { ISD::CTTZ, MVT::v4i32, 18 },
3229 { ISD::CTTZ, MVT::v8i16, 16 },
3230 { ISD::CTTZ, MVT::v16i8, 13 },
3231 { ISD::SADDSAT, MVT::v8i16, 1 },
3232 { ISD::SADDSAT, MVT::v16i8, 1 },
3233 { ISD::SMAX, MVT::v8i16, 1 },
3234 { ISD::SMIN, MVT::v8i16, 1 },
3235 { ISD::SSUBSAT, MVT::v8i16, 1 },
3236 { ISD::SSUBSAT, MVT::v16i8, 1 },
3237 { ISD::UADDSAT, MVT::v8i16, 1 },
3238 { ISD::UADDSAT, MVT::v16i8, 1 },
3239 { ISD::UMAX, MVT::v8i16, 2 },
3240 { ISD::UMAX, MVT::v16i8, 1 },
3241 { ISD::UMIN, MVT::v8i16, 2 },
3242 { ISD::UMIN, MVT::v16i8, 1 },
3243 { ISD::USUBSAT, MVT::v8i16, 1 },
3244 { ISD::USUBSAT, MVT::v16i8, 1 },
3245 { ISD::FMAXNUM, MVT::f64, 4 },
3246 { ISD::FMAXNUM, MVT::v2f64, 4 },
3247 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
3248 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
3250 static const CostTblEntry SSE1CostTbl[] = {
3251 { ISD::FMAXNUM, MVT::f32, 4 },
3252 { ISD::FMAXNUM, MVT::v4f32, 4 },
3253 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
3254 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
3256 static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
3257 { ISD::CTTZ, MVT::i64, 1 },
3259 static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
3260 { ISD::CTTZ, MVT::i32, 1 },
3261 { ISD::CTTZ, MVT::i16, 1 },
3262 { ISD::CTTZ, MVT::i8, 1 },
3264 static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
3265 { ISD::CTLZ, MVT::i64, 1 },
3267 static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
3268 { ISD::CTLZ, MVT::i32, 1 },
3269 { ISD::CTLZ, MVT::i16, 1 },
3270 { ISD::CTLZ, MVT::i8, 1 },
3272 static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
3273 { ISD::CTPOP, MVT::i64, 1 },
3275 static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
3276 { ISD::CTPOP, MVT::i32, 1 },
3277 { ISD::CTPOP, MVT::i16, 1 },
3278 { ISD::CTPOP, MVT::i8, 1 },
3280 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3281 { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
3282 { ISD::BITREVERSE, MVT::i64, 14 },
3283 { ISD::BSWAP, MVT::i64, 1 },
3284 { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
3285 { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
3286 { ISD::CTPOP, MVT::i64, 10 },
3287 { ISD::SADDO, MVT::i64, 1 },
3288 { ISD::UADDO, MVT::i64, 1 },
3289 { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
3291 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3292 { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
3293 { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
3294 { ISD::BITREVERSE, MVT::i32, 14 },
3295 { ISD::BITREVERSE, MVT::i16, 14 },
3296 { ISD::BITREVERSE, MVT::i8, 11 },
3297 { ISD::BSWAP, MVT::i32, 1 },
3298 { ISD::BSWAP, MVT::i16, 1 }, // ROL
3299 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
3300 { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
3301 { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
3302 { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
3303 { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
3304 { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
3305 { ISD::CTPOP, MVT::i32, 8 },
3306 { ISD::CTPOP, MVT::i16, 9 },
3307 { ISD::CTPOP, MVT::i8, 7 },
3308 { ISD::SADDO, MVT::i32, 1 },
3309 { ISD::SADDO, MVT::i16, 1 },
3310 { ISD::SADDO, MVT::i8, 1 },
3311 { ISD::UADDO, MVT::i32, 1 },
3312 { ISD::UADDO, MVT::i16, 1 },
3313 { ISD::UADDO, MVT::i8, 1 },
3314 { ISD::UMULO, MVT::i32, 2 }, // mul + seto
3315 { ISD::UMULO, MVT::i16, 2 },
3316 { ISD::UMULO, MVT::i8, 2 },
3319 Type *RetTy = ICA.getReturnType();
3320 Type *OpTy = RetTy;
3321 Intrinsic::ID IID = ICA.getID();
3322 unsigned ISD = ISD::DELETED_NODE;
3323 switch (IID) {
3324 default:
3325 break;
3326 case Intrinsic::abs:
3327 ISD = ISD::ABS;
3328 break;
3329 case Intrinsic::bitreverse:
3330 ISD = ISD::BITREVERSE;
3331 break;
3332 case Intrinsic::bswap:
3333 ISD = ISD::BSWAP;
3334 break;
3335 case Intrinsic::ctlz:
3336 ISD = ISD::CTLZ;
3337 break;
3338 case Intrinsic::ctpop:
3339 ISD = ISD::CTPOP;
3340 break;
3341 case Intrinsic::cttz:
3342 ISD = ISD::CTTZ;
3343 break;
3344 case Intrinsic::maxnum:
3345 case Intrinsic::minnum:
3346 // FMINNUM has same costs so don't duplicate.
3347 ISD = ISD::FMAXNUM;
3348 break;
3349 case Intrinsic::sadd_sat:
3350 ISD = ISD::SADDSAT;
3351 break;
3352 case Intrinsic::smax:
3353 ISD = ISD::SMAX;
3354 break;
3355 case Intrinsic::smin:
3356 ISD = ISD::SMIN;
3357 break;
3358 case Intrinsic::ssub_sat:
3359 ISD = ISD::SSUBSAT;
3360 break;
3361 case Intrinsic::uadd_sat:
3362 ISD = ISD::UADDSAT;
3363 break;
3364 case Intrinsic::umax:
3365 ISD = ISD::UMAX;
3366 break;
3367 case Intrinsic::umin:
3368 ISD = ISD::UMIN;
3369 break;
3370 case Intrinsic::usub_sat:
3371 ISD = ISD::USUBSAT;
3372 break;
3373 case Intrinsic::sqrt:
3374 ISD = ISD::FSQRT;
3375 break;
3376 case Intrinsic::sadd_with_overflow:
3377 case Intrinsic::ssub_with_overflow:
3378 // SSUBO has same costs so don't duplicate.
3379 ISD = ISD::SADDO;
3380 OpTy = RetTy->getContainedType(0);
3381 break;
3382 case Intrinsic::uadd_with_overflow:
3383 case Intrinsic::usub_with_overflow:
3384 // USUBO has same costs so don't duplicate.
3385 ISD = ISD::UADDO;
3386 OpTy = RetTy->getContainedType(0);
3387 break;
3388 case Intrinsic::umul_with_overflow:
3389 case Intrinsic::smul_with_overflow:
3390 // SMULO has same costs so don't duplicate.
3391 ISD = ISD::UMULO;
3392 OpTy = RetTy->getContainedType(0);
3393 break;
3394 }
3396 if (ISD != ISD::DELETED_NODE) {
3397 // Legalize the type.
3398 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
3399 MVT MTy = LT.second;
3401 // Attempt to lookup cost.
3402 if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
3403 MTy.isVector()) {
3404 // With PSHUFB the code is very similar for all types. If we have integer
3405 // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
3406 // we also need a PSHUFB.
3407 unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
3409 // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
3410 // instructions. We also need an extract and an insert.
3411 if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
3412 (ST->hasBWI() && MTy.is512BitVector())))
3413 Cost = Cost * 2 + 2;
3415 return LT.first * Cost;
3416 }
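// Illustrative example: with GFNI, bitreverse of <16 x i8> is a single
// GF2P8AFFINEQB (Cost = 1), while <4 x i32> also needs a PSHUFB byte swap
// (Cost = 2); unsupported widths pay the doubling plus extract/insert.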
3418 auto adjustTableCost = [](const CostTblEntry &Entry,
3419 InstructionCost LegalizationCost,
3420 FastMathFlags FMF) {
3421 // If there are no NANs to deal with, then these are reduced to a
3422 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
3423 // assume is used in the non-fast case.
3424 if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
3425 if (FMF.noNaNs())
3426 return LegalizationCost * 1;
3427 }
3428 return LegalizationCost * (int)Entry.Cost;
3429 };
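// Illustrative example: llvm.maxnum.v4f32 with the nnan fast-math flag is
// costed as a single MAXPS, whereas without it the FMAXNUM table cost also
// covers the CMPUNORDPS + BLENDVPS NaN handling.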
3431 if (ST->useGLMDivSqrtCosts())
3432 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
3433 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3435 if (ST->useSLMArithCosts())
3436 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3437 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3439 if (ST->hasBITALG())
3440 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
3441 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3443 if (ST->hasVPOPCNTDQ())
3444 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
3445 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3447 if (ST->hasCDI())
3448 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
3449 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3451 if (ST->hasBWI())
3452 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3453 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3455 if (ST->hasAVX512())
3456 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3457 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3459 if (ST->hasXOP())
3460 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3461 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3463 if (ST->hasAVX2())
3464 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3465 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3467 if (ST->hasAVX())
3468 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3469 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3471 if (ST->hasSSE42())
3472 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3473 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3475 if (ST->hasSSE41())
3476 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3477 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3479 if (ST->hasSSSE3())
3480 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
3481 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3483 if (ST->hasSSE2())
3484 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3485 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3487 if (ST->hasSSE1())
3488 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3489 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3491 if (ST->hasBMI()) {
3492 if (ST->is64Bit())
3493 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
3494 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3496 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
3497 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3498 }
3500 if (ST->hasLZCNT()) {
3501 if (ST->is64Bit())
3502 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
3503 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3505 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
3506 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3509 if (ST->hasPOPCNT()) {
3510 if (ST->is64Bit())
3511 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
3512 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3514 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
3515 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3516 }
3518 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
3519 if (const Instruction *II = ICA.getInst()) {
3520 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
3521 return TTI::TCC_Free;
3522 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
3523 if (LI->hasOneUse())
3524 return TTI::TCC_Free;
3525 }
3526 }
3527 }
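// Illustrative example: with fast MOVBE, patterns such as
// "store i32 (bswap (load i32))" fold the byte swap into a MOVBE load or
// store, so a single-use bswap feeding (or fed by) memory is treated as free.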
3529 if (ST->is64Bit())
3530 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3531 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3533 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3534 return adjustTableCost(*Entry, LT.first, ICA.getFlags());
3535 }
3537 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3538 }
3540 InstructionCost
3541 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3542 TTI::TargetCostKind CostKind) {
3543 if (ICA.isTypeBasedOnly())
3544 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
3546 static const CostTblEntry AVX512BWCostTbl[] = {
3547 { ISD::ROTL, MVT::v32i16, 2 },
3548 { ISD::ROTL, MVT::v16i16, 2 },
3549 { ISD::ROTL, MVT::v8i16, 2 },
3550 { ISD::ROTL, MVT::v64i8, 5 },
3551 { ISD::ROTL, MVT::v32i8, 5 },
3552 { ISD::ROTL, MVT::v16i8, 5 },
3553 { ISD::ROTR, MVT::v32i16, 2 },
3554 { ISD::ROTR, MVT::v16i16, 2 },
3555 { ISD::ROTR, MVT::v8i16, 2 },
3556 { ISD::ROTR, MVT::v64i8, 5 },
3557 { ISD::ROTR, MVT::v32i8, 5 },
3558 { ISD::ROTR, MVT::v16i8, 5 }
3560 static const CostTblEntry AVX512CostTbl[] = {
3561 { ISD::ROTL, MVT::v8i64, 1 },
3562 { ISD::ROTL, MVT::v4i64, 1 },
3563 { ISD::ROTL, MVT::v2i64, 1 },
3564 { ISD::ROTL, MVT::v16i32, 1 },
3565 { ISD::ROTL, MVT::v8i32, 1 },
3566 { ISD::ROTL, MVT::v4i32, 1 },
3567 { ISD::ROTR, MVT::v8i64, 1 },
3568 { ISD::ROTR, MVT::v4i64, 1 },
3569 { ISD::ROTR, MVT::v2i64, 1 },
3570 { ISD::ROTR, MVT::v16i32, 1 },
3571 { ISD::ROTR, MVT::v8i32, 1 },
3572 { ISD::ROTR, MVT::v4i32, 1 }
3574 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3575 static const CostTblEntry XOPCostTbl[] = {
3576 { ISD::ROTL, MVT::v4i64, 4 },
3577 { ISD::ROTL, MVT::v8i32, 4 },
3578 { ISD::ROTL, MVT::v16i16, 4 },
3579 { ISD::ROTL, MVT::v32i8, 4 },
3580 { ISD::ROTL, MVT::v2i64, 1 },
3581 { ISD::ROTL, MVT::v4i32, 1 },
3582 { ISD::ROTL, MVT::v8i16, 1 },
3583 { ISD::ROTL, MVT::v16i8, 1 },
3584 { ISD::ROTR, MVT::v4i64, 6 },
3585 { ISD::ROTR, MVT::v8i32, 6 },
3586 { ISD::ROTR, MVT::v16i16, 6 },
3587 { ISD::ROTR, MVT::v32i8, 6 },
3588 { ISD::ROTR, MVT::v2i64, 2 },
3589 { ISD::ROTR, MVT::v4i32, 2 },
3590 { ISD::ROTR, MVT::v8i16, 2 },
3591 { ISD::ROTR, MVT::v16i8, 2 }
3593 static const CostTblEntry X64CostTbl[] = { // 64-bit targets
3594 { ISD::ROTL, MVT::i64, 1 },
3595 { ISD::ROTR, MVT::i64, 1 },
3596 { ISD::FSHL, MVT::i64, 4 }
3598 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
3599 { ISD::ROTL, MVT::i32, 1 },
3600 { ISD::ROTL, MVT::i16, 1 },
3601 { ISD::ROTL, MVT::i8, 1 },
3602 { ISD::ROTR, MVT::i32, 1 },
3603 { ISD::ROTR, MVT::i16, 1 },
3604 { ISD::ROTR, MVT::i8, 1 },
3605 { ISD::FSHL, MVT::i32, 4 },
3606 { ISD::FSHL, MVT::i16, 4 },
3607 { ISD::FSHL, MVT::i8, 4 }
3610 Intrinsic::ID IID = ICA.getID();
3611 Type *RetTy = ICA.getReturnType();
3612 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
3613 unsigned ISD = ISD::DELETED_NODE;
3614 switch (IID) {
3615 default:
3616 break;
3617 case Intrinsic::fshl:
3618 ISD = ISD::FSHL;
3619 if (Args[0] == Args[1])
3620 ISD = ISD::ROTL;
3621 break;
3622 case Intrinsic::fshr:
3623 // FSHR has same costs so don't duplicate.
3624 ISD = ISD::FSHL;
3625 if (Args[0] == Args[1])
3626 ISD = ISD::ROTR;
3627 break;
3628 }
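// Illustrative note: a funnel shift with equal operands, e.g.
// call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %c), is exactly a rotate,
// which is why it is remapped to the much cheaper ISD::ROTL/ROTR here.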
3630 if (ISD != ISD::DELETED_NODE) {
3631 // Legalize the type.
3632 std::pair<InstructionCost, MVT> LT =
3633 TLI->getTypeLegalizationCost(DL, RetTy);
3634 MVT MTy = LT.second;
3636 // Attempt to lookup cost.
3638 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3639 return LT.first * Entry->Cost;
3641 if (ST->hasAVX512())
3642 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3643 return LT.first * Entry->Cost;
3646 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3647 return LT.first * Entry->Cost;
3650 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
3651 return LT.first * Entry->Cost;
3653 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
3654 return LT.first * Entry->Cost;
3657 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
3658 }
3660 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3661 unsigned Index) {
3662 static const CostTblEntry SLMCostTbl[] = {
3663 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
3664 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
3665 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
3666 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
3669 assert(Val->isVectorTy() && "This must be a vector type");
3670 Type *ScalarType = Val->getScalarType();
3671 InstructionCost RegisterFileMoveCost = 0;
3673 // Non-immediate extraction/insertion can be handled as a sequence of
3674 // aliased loads+stores via the stack.
3675 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
3676 Opcode == Instruction::InsertElement)) {
3677 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
3678 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
3680 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
3681 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
3682 Align VecAlign = DL.getPrefTypeAlign(Val);
3683 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
3685 // Extract - store vector to stack, load scalar.
3686 if (Opcode == Instruction::ExtractElement) {
3687 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
3688 TTI::TargetCostKind::TCK_RecipThroughput) +
3689 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
3690 TTI::TargetCostKind::TCK_RecipThroughput);
3692 // Insert - store vector to stack, store scalar, load vector.
3693 if (Opcode == Instruction::InsertElement) {
3694 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
3695 TTI::TargetCostKind::TCK_RecipThroughput) +
3696 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
3697 TTI::TargetCostKind::TCK_RecipThroughput) +
3698 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0,
3699 TTI::TargetCostKind::TCK_RecipThroughput);
3700 }
3701 }
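// Illustrative example: extracting a variable (non-immediate) lane from
// <4 x i32> is costed as one vector store to the stack plus one scalar reload.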
3703 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
3704 Opcode == Instruction::InsertElement)) {
3705 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
3706 if (Opcode == Instruction::ExtractElement &&
3707 ScalarType->getScalarSizeInBits() == 1 &&
3708 cast<FixedVectorType>(Val)->getNumElements() > 1)
3709 return 1;
3711 // Legalize the type.
3712 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
3714 // This type is legalized to a scalar type.
3715 if (!LT.second.isVector())
3716 return 0;
3718 // The type may be split. Normalize the index to the new type.
3719 unsigned SizeInBits = LT.second.getSizeInBits();
3720 unsigned NumElts = LT.second.getVectorNumElements();
3721 unsigned SubNumElts = NumElts;
3722 Index = Index % NumElts;
3724 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
3725 // For inserts, we also need to insert the subvector back.
3726 if (SizeInBits > 128) {
3727 assert((SizeInBits % 128) == 0 && "Illegal vector");
3728 unsigned NumSubVecs = SizeInBits / 128;
3729 SubNumElts = NumElts / NumSubVecs;
3730 if (SubNumElts <= Index) {
3731 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
3732 Index %= SubNumElts;
3733 }
3734 }
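// Illustrative example: inserting into lane 5 of an AVX2 <8 x i32> lands in
// the upper 128-bit subvector, so we pay the extract + re-insert of that
// subvector (RegisterFileMoveCost += 2) and then cost the operation as
// lane 1 of a v4i32.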
3737 // Floating point scalars are already located in index #0.
3738 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
3739 // it's free.
3740 if (ScalarType->isFloatingPointTy())
3741 return RegisterFileMoveCost;
3743 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
3744 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
3745 return 1 + RegisterFileMoveCost;
3746 }
3748 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3749 assert(ISD && "Unexpected vector opcode");
3750 MVT MScalarTy = LT.second.getScalarType();
3751 if (ST->useSLMArithCosts())
3752 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
3753 return Entry->Cost + RegisterFileMoveCost;
3755 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
3756 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3757 (MScalarTy.isInteger() && ST->hasSSE41()))
3758 return 1 + RegisterFileMoveCost;
3760 // Assume insertps is relatively cheap on all targets.
3761 if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
3762 Opcode == Instruction::InsertElement)
3763 return 1 + RegisterFileMoveCost;
3765 // For extractions we just need to shuffle the element to index 0, which
3766 // should be very cheap (assume cost = 1). For insertions we need to shuffle
3767 // the elements to its destination. In both cases we must handle the
3768 // subvector move(s).
3769 // If the vector type is already less than 128-bits then don't reduce it.
3770 // TODO: Under what circumstances should we shuffle using the full width?
3771 InstructionCost ShuffleCost = 1;
3772 if (Opcode == Instruction::InsertElement) {
3773 auto *SubTy = cast<VectorType>(Val);
3774 EVT VT = TLI->getValueType(DL, Val);
3775 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
3776 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
3777 ShuffleCost =
3778 getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
3780 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
3781 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
3782 }
3784 // Add to the base cost if we know that the extracted element of a vector is
3785 // destined to be moved to and used in the integer register file.
3786 if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
3787 RegisterFileMoveCost += 1;
3789 return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
3790 }
3792 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
3793 const APInt &DemandedElts,
3794 bool Insert,
3795 bool Extract) {
3796 assert(DemandedElts.getBitWidth() ==
3797 cast<FixedVectorType>(Ty)->getNumElements() &&
3798 "Vector size mismatch");
3800 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
3801 MVT MScalarTy = LT.second.getScalarType();
3802 unsigned SizeInBits = LT.second.getSizeInBits();
3804 InstructionCost Cost = 0;
3806 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be
3807 // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
3808 if (Insert) {
3809 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
3810 (MScalarTy.isInteger() && ST->hasSSE41()) ||
3811 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
3812 // For types we can insert directly, insertion into 128-bit sub vectors is
3813 // cheap, followed by a cheap chain of concatenations.
3814 if (SizeInBits <= 128) {
3815 Cost +=
3816 BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
3817 } else {
3818 // In each 128-lane, if at least one index is demanded but not all
3819 // indices are demanded and this 128-lane is not the first 128-lane of
3820 // the legalized-vector, then this 128-lane needs an extracti128; if in
3821 // each 128-lane, there is at least one demanded index, this 128-lane
3822 // needs an inserti128.
3824 // The following cases will help you build a better understanding:
3825 // Assume we insert several elements into a v8i32 vector in avx2,
3826 // Case#1: inserting into 1st index needs vpinsrd + inserti128.
3827 // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
3828 // inserti128.
3829 // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
3830 const int CostValue = *LT.first.getValue();
3831 assert(CostValue >= 0 && "Negative cost!");
3832 unsigned Num128Lanes = SizeInBits / 128 * CostValue;
3833 unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
3834 APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
3835 unsigned Scale = NumElts / Num128Lanes;
3836 // We iterate each 128-lane, and check if we need an
3837 // extracti128/inserti128 for this 128-lane.
3838 for (unsigned I = 0; I < NumElts; I += Scale) {
3839 APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
3840 APInt MaskedDE = Mask & WidenedDemandedElts;
3841 unsigned Population = MaskedDE.countPopulation();
3842 Cost += (Population > 0 && Population != Scale &&
3843 I % LT.second.getVectorNumElements() != 0);
3844 Cost += Population > 0;
3845 }
3846 Cost += DemandedElts.countPopulation();
3848 // For vXf32 cases, insertion into the 0'th index in each v4f32
3849 // 128-bit vector is free.
3850 // NOTE: This assumes legalization widens vXf32 vectors.
3851 if (MScalarTy == MVT::f32)
3852 for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
3853 i < e; i += 4)
3854 if (DemandedElts[i])
3855 Cost--;
3857 } else if (LT.second.isVector()) {
3858 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
3859 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
3860 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
3861 // considered cheap.
3862 if (Ty->isIntOrIntVectorTy())
3863 Cost += DemandedElts.countPopulation();
3865 // Get the smaller of the legalized or original pow2-extended number of
3866 // vector elements, which represents the number of unpacks we'll end up
3867 // performing.
3868 unsigned NumElts = LT.second.getVectorNumElements();
3869 unsigned Pow2Elts =
3870 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
3871 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
3872 }
3873 }
3875 if (Extract) {
3876 // vXi1 can be efficiently extracted with MOVMSK.
3877 // TODO: AVX512 predicate mask handling.
3878 // NOTE: This doesn't work well for roundtrip scalarization.
3879 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
3880 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
3881 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
3882 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
3883 return MOVMSKCost;
3884 }
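// Illustrative example: scalarizing all lanes of <32 x i1> on AVX2 takes a
// single 32-lane VPMOVMSKB, i.e. MOVMSKCost = 1; with only SSE2 (16 lanes
// per MOVMSK) the same vector needs two.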
3886 if (LT.second.isVector()) {
3887 int CostValue = *LT.first.getValue();
3888 assert(CostValue >= 0 && "Negative cost!");
3890 unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
3891 assert(NumElts >= DemandedElts.getBitWidth() &&
3892 "Vector has been legalized to smaller element count");
3894 // If we're extracting elements from a 128-bit subvector lane, we only need
3895 // to extract each lane once, not for every element.
3896 if (SizeInBits > 128) {
3897 assert((SizeInBits % 128) == 0 && "Illegal vector");
3898 unsigned NumLegal128Lanes = SizeInBits / 128;
3899 unsigned Num128Lanes = NumLegal128Lanes * CostValue;
3900 APInt WidenedDemandedElts = DemandedElts.zext(NumElts);
3901 unsigned Scale = NumElts / Num128Lanes;
3903 // Add cost for each demanded 128-bit subvector extraction.
3904 // Luckily this is a lot easier than for insertion.
3905 APInt DemandedUpper128Lanes =
3906 APIntOps::ScaleBitMask(WidenedDemandedElts, Num128Lanes);
3907 auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale);
3908 for (unsigned I = 0; I != Num128Lanes; ++I)
3909 if (DemandedUpper128Lanes[I])
3910 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
3911 I * Scale, Ty128);
3913 // Add all the demanded element extractions together, but adjust the
3914 // index to use the equivalent of the bottom 128 bit lane.
3915 for (unsigned I = 0; I != NumElts; ++I)
3916 if (WidenedDemandedElts[I]) {
3917 unsigned Idx = I % Scale;
3918 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, Idx);
3919 }
3921 return Cost;
3922 }
3923 }
3925 // Fallback to default extraction.
3926 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
3927 }
3929 return Cost;
3930 }
3932 InstructionCost
3933 X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
3934 int VF, const APInt &DemandedDstElts,
3935 TTI::TargetCostKind CostKind) {
3936 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
3937 // We don't differentiate element types here, only element bit width.
3938 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
3940 auto bailout = [&]() {
3941 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
3942 DemandedDstElts, CostKind);
3943 };
3945 // For now, only deal with AVX512 cases.
3946 if (!ST->hasAVX512())
3947 return bailout();
3949 // Do we have a native shuffle for this element type, or should we promote?
3950 unsigned PromEltTyBits = EltTyBits;
3951 switch (EltTyBits) {
3952 case 32:
3953 case 64:
3954 break; // AVX512F.
3955 case 16:
3956 if (!ST->hasBWI())
3957 PromEltTyBits = 32; // promote to i32, AVX512F.
3958 break; // AVX512BW
3959 case 8:
3960 if (!ST->hasVBMI())
3961 PromEltTyBits = 32; // promote to i32, AVX512F.
3962 break; // AVX512VBMI
3963 case 1:
3964 // There is no support for shuffling i1 elements. We *must* promote.
3965 if (ST->hasBWI()) {
3966 if (ST->hasVBMI())
3967 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
3968 else
3969 PromEltTyBits = 16; // promote to i16, AVX512BW.
3970 break;
3971 }
3973 PromEltTyBits = 32; // promote to i32, AVX512F.
3974 break;
3975 default:
3976 return bailout();
3977 }
3980 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
3982 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
3983 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
3985 int NumDstElements = VF * ReplicationFactor;
3986 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
3987 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
3989 // Legalize the types.
3990 MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second;
3991 MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second;
3992 MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second;
3993 MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second;
3994 // They should have legalized into vector types.
3995 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
3996 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
3999 if (PromEltTyBits != EltTyBits) {
4000 // If we have to perform the shuffle with a wider elt type than our data
4001 // type, then we will first need to anyext (we don't care about the new
4002 // bits) the source elements, and then truncate Dst elements.
4003 InstructionCost PromotionCost;
4004 PromotionCost += getCastInstrCost(
4005 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
4006 TargetTransformInfo::CastContextHint::None, CostKind);
4007 PromotionCost +=
4008 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
4009 /*Src=*/PromDstVecTy,
4010 TargetTransformInfo::CastContextHint::None, CostKind);
4011 return PromotionCost + getReplicationShuffleCost(PromEltTy,
4012 ReplicationFactor, VF,
4013 DemandedDstElts, CostKind);
4016 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
4017 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
4018 "We expect that the legalization doesn't affect the element width, "
4019 "doesn't coalesce/split elements.");
4021 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
4022 unsigned NumDstVectors =
4023 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
4025 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
4027 // Not all the produced Dst elements may be demanded. In our case,
4028 // given that a single Dst vector is formed by a single shuffle,
4029 // if all elements that will form a single Dst vector aren't demanded,
4030 // then we won't need to do that shuffle, so adjust the cost accordingly.
4031 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
4032 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
4033 unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
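// Illustrative example: for VF = 4 with ReplicationFactor = 2, the 8 dst
// elements may legalize into two vectors; if only the first half is
// demanded, only one of the two per-vector shuffles is paid for below.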
4035 InstructionCost SingleShuffleCost =
4036 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy,
4037 /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr);
4038 return NumDstVectorsDemanded * SingleShuffleCost;
4039 }
4041 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
4042 MaybeAlign Alignment,
4043 unsigned AddressSpace,
4044 TTI::TargetCostKind CostKind,
4045 const Instruction *I) {
4046 // TODO: Handle other cost kinds.
4047 if (CostKind != TTI::TCK_RecipThroughput) {
4048 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
4049 // Store instruction with index and scale costs 2 Uops.
4050 // Check the preceding GEP to identify non-const indices.
4051 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
4052 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
4053 return TTI::TCC_Basic * 2;
4054 }
4055 }
4056 return TTI::TCC_Basic;
4057 }
4059 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4060 "Invalid opcode");
4061 // Type legalization can't handle structs
4062 if (TLI->getValueType(DL, Src, true) == MVT::Other)
4063 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4064 CostKind);
4066 // Legalize the type.
4067 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
4069 auto *VTy = dyn_cast<FixedVectorType>(Src);
4071 // Handle the simple case of non-vectors.
4072 // NOTE: this assumes that legalization never creates a vector from scalars!
4073 if (!VTy || !LT.second.isVector())
4074 // Each load/store unit costs 1.
4075 return LT.first * 1;
4077 bool IsLoad = Opcode == Instruction::Load;
4079 Type *EltTy = VTy->getElementType();
4081 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
4083 InstructionCost Cost = 0;
4085 // Source of truth: how many elements were there in the original IR vector?
4086 const unsigned SrcNumElt = VTy->getNumElements();
4088 // How far have we gotten?
4089 int NumEltRemaining = SrcNumElt;
4090 // Note that we intentionally capture by-reference, NumEltRemaining changes.
4091 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
4093 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
4095 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
4096 const unsigned XMMBits = 128;
4097 if (XMMBits % EltTyBits != 0)
4098 // Vector size must be a multiple of the element size. I.e. no padding.
4099 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4100 CostKind);
4101 const int NumEltPerXMM = XMMBits / EltTyBits;
4103 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
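// Illustrative example: a <7 x i32> load on SSE2 (legal type v4i32) is
// roughly costed as one 16-byte op for the first four elements, then an
// 8-byte and a 4-byte op for the tail, plus insert-subvector shuffles for
// the non-0'th pieces.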
4105 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
4106 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
4107 // How many elements would a single op deal with at once?
4108 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
4109 // Vector size must be a multiple of the element size. I.e. no padding.
4110 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4111 CostKind);
4112 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
4114 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
4115 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
4116 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
4117 "Unless we haven't halved the op size yet, "
4118 "we have less than two op's sized units of work left.");
4120 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
4121 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
4122 : XMMVecTy;
4124 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
4125 "After halving sizes, the vector elt count is no longer a multiple "
4126 "of number of elements per operation?");
4127 auto *CoalescedVecTy =
4128 CurrNumEltPerOp == 1
4129 ? CurrVecTy
4130 : FixedVectorType::get(
4131 IntegerType::get(Src->getContext(),
4132 EltTyBits * CurrNumEltPerOp),
4133 CurrVecTy->getNumElements() / CurrNumEltPerOp);
4134 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
4135 DL.getTypeSizeInBits(CurrVecTy) &&
4136 "coalesciing elements doesn't change vector width.");
4138 while (NumEltRemaining > 0) {
4139 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
4141 // Can we use this vector size, as per the remaining element count?
4142 // Iff the vector is naturally aligned, we can do a wide load regardless.
4143 if (NumEltRemaining < CurrNumEltPerOp &&
4144 (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
4145 CurrOpSizeBytes != 1)
4146 break; // Try smaller vector size.
4148 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
4150 // If we have fully processed the previous reg, we need to replenish it.
4151 if (SubVecEltsLeft == 0) {
4152 SubVecEltsLeft += CurrVecTy->getNumElements();
4153 // And that's free only for the 0'th subvector of a legalized vector.
4154 if (!Is0thSubVec)
4155 Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
4156 : TTI::ShuffleKind::SK_ExtractSubvector,
4157 VTy, None, NumEltDone(), CurrVecTy);
4158 }
4160 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
4161 // for smaller widths (32/16/8) we have to insert/extract them separately.
4162 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
4163 // but let's pretend that it is also true for 16/8 bit wide ops...)
4164 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
4165 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
4166 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
4167 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
4168 APInt DemandedElts =
4169 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
4170 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
4171 assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
4172 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
4173 !IsLoad);
4174 }
4176 // This isn't exactly right. We're using slow unaligned 32-byte accesses
4177 // as a proxy for a double-pumped AVX memory interface such as on
4178 // Sandy Bridge.
4179 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
4180 Cost += 2;
4181 else
4182 Cost += 1;
4184 SubVecEltsLeft -= CurrNumEltPerOp;
4185 NumEltRemaining -= CurrNumEltPerOp;
4186 Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
4187 }
4188 }
4190 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
4192 return Cost;
4193 }
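// Illustrative sketch of the loop above (not from the source): the op size
// starts at the widest legal width and halves each outer iteration. Pieces
// that are not the 0'th subvector of the legalized type are charged a
// subvector insert/extract shuffle, and sub-XMM pieces (4/2/1 bytes) are
// additionally charged element insert/extract via scalarization overhead,
// all on top of the raw per-op memory cost.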
4195 InstructionCost
4196 X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
4197 unsigned AddressSpace,
4198 TTI::TargetCostKind CostKind) {
4199 bool IsLoad = (Instruction::Load == Opcode);
4200 bool IsStore = (Instruction::Store == Opcode);
4202 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
4203 if (!SrcVTy)
4204 // To calculate the scalar cost, take the regular memory op cost without a mask.
4205 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
4207 unsigned NumElem = SrcVTy->getNumElements();
4208 auto *MaskTy =
4209 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
4210 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
4211 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
4213 APInt DemandedElts = APInt::getAllOnes(NumElem);
4214 InstructionCost MaskSplitCost =
4215 getScalarizationOverhead(MaskTy, DemandedElts, false, true);
4216 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
4217 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
4218 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4219 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
4220 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
4221 InstructionCost ValueSplitCost =
4222 getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
4223 InstructionCost MemopCost =
4224 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
4225 Alignment, AddressSpace, CostKind);
4226 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
4227 }
4229 // Legalize the type.
4230 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
4231 auto VT = TLI->getValueType(DL, SrcVTy);
4232 InstructionCost Cost = 0;
4233 if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
4234 LT.second.getVectorNumElements() == NumElem)
4235 // Promotion requires extend/truncate for data and a shuffle for mask.
4236 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
4237 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
4239 else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
4240 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
4241 LT.second.getVectorNumElements());
4242 // Expanding requires filling the mask with zeroes.
4243 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
4244 }
4246 // Pre-AVX512 - each maskmov load costs 2 and each store costs ~8.
4247 if (!ST->hasAVX512())
4248 return Cost + LT.first * (IsLoad ? 2 : 8);
4250 // AVX-512 masked load/store is cheaper
4251 return Cost + LT.first;
4252 }
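// Rough example (illustrative): for a legal masked <8 x float> op on AVX,
// the code above adds 2 per legalized vector for a maskmov-style load and
// ~8 for a store; with AVX-512 the masked op is modeled like a plain
// load/store (just LT.first), plus any promotion/expansion shuffles.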
4254 InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
4255 ScalarEvolution *SE,
4256 const SCEV *Ptr) {
4257 // Address computations in vectorized code with non-consecutive addresses will
4258 // likely result in more instructions compared to scalar code where the
4259 // computation can more often be merged into the index mode. The resulting
4260 // extra micro-ops can significantly decrease throughput.
4261 const unsigned NumVectorInstToHideOverhead = 10;
4263 // Cost modeling of Strided Access Computation is hidden by the indexing
4264 // modes of X86 regardless of the stride value. We don't believe that there
4265 // is a difference between constant strided access in general and a constant
4266 // stride value which is less than or equal to 64.
4267 // Even in the case of (loop invariant) stride whose value is not known at
4268 // compile time, the address computation will not incur more than one extra
4269 // instruction.
4270 if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
4271 // TODO: AVX2 is the current cut-off because we don't have correct
4272 // interleaving costs for prior ISA's.
4273 if (!BaseT::isStridedAccess(Ptr))
4274 return NumVectorInstToHideOverhead;
4275 if (!BaseT::getConstantStrideStep(SE, Ptr))
4276 return 1;
4277 }
4279 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
4280 }
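// For instance (illustrative): in pre-AVX2 vectorized code, a pointer that is
// not a strided access at all is charged NumVectorInstToHideOverhead (10) to
// reflect the extra address-computation micro-ops; a strided access whose
// step isn't a compile-time constant costs 1; a constant-stride access falls
// through to the generic (effectively free) implementation.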
4282 InstructionCost
4283 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
4284 Optional<FastMathFlags> FMF,
4285 TTI::TargetCostKind CostKind) {
4286 if (TTI::requiresOrderedReduction(FMF))
4287 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4289 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
4290 // and use that as the cost.
4292 static const CostTblEntry SLMCostTblNoPairWise[] = {
4293 { ISD::FADD, MVT::v2f64, 3 },
4294 { ISD::ADD, MVT::v2i64, 5 },
4295 };
4297 static const CostTblEntry SSE2CostTblNoPairWise[] = {
4298 { ISD::FADD, MVT::v2f64, 2 },
4299 { ISD::FADD, MVT::v2f32, 2 },
4300 { ISD::FADD, MVT::v4f32, 4 },
4301 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
4302 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
4303 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
4304 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
4305 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
4306 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
4307 { ISD::ADD, MVT::v2i8, 2 },
4308 { ISD::ADD, MVT::v4i8, 2 },
4309 { ISD::ADD, MVT::v8i8, 2 },
4310 { ISD::ADD, MVT::v16i8, 3 },
4311 };
4313 static const CostTblEntry AVX1CostTblNoPairWise[] = {
4314 { ISD::FADD, MVT::v4f64, 3 },
4315 { ISD::FADD, MVT::v4f32, 3 },
4316 { ISD::FADD, MVT::v8f32, 4 },
4317 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
4318 { ISD::ADD, MVT::v4i64, 3 },
4319 { ISD::ADD, MVT::v8i32, 5 },
4320 { ISD::ADD, MVT::v16i16, 5 },
4321 { ISD::ADD, MVT::v32i8, 4 },
4322 };
4324 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4325 assert(ISD && "Invalid opcode");
4327 // Before legalizing the type, give a chance to look up illegal narrow types
4328 // in the table.
4329 // FIXME: Is there a better way to do this?
4330 EVT VT = TLI->getValueType(DL, ValTy);
4331 if (VT.isSimple()) {
4332 MVT MTy = VT.getSimpleVT();
4333 if (ST->useSLMArithCosts())
4334 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
4335 return Entry->Cost;
4337 if (ST->hasAVX())
4338 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4339 return Entry->Cost;
4341 if (ST->hasSSE2())
4342 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4343 return Entry->Cost;
4344 }
4346 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
4348 MVT MTy = LT.second;
4350 auto *ValVTy = cast<FixedVectorType>(ValTy);
4352 // Special case: vXi8 mul reductions are performed as vXi16.
4353 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
4354 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
4355 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
4356 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
4357 TargetTransformInfo::CastContextHint::None,
4358 CostKind) +
4359 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
4360 }
4362 InstructionCost ArithmeticCost = 0;
4363 if (LT.first != 1 && MTy.isVector() &&
4364 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4365 // Type needs to be split. We need LT.first - 1 arithmetic ops.
4366 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
4367 MTy.getVectorNumElements());
4368 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
4369 ArithmeticCost *= LT.first - 1;
4370 }
4372 if (ST->useSLMArithCosts())
4373 if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
4374 return ArithmeticCost + Entry->Cost;
4376 if (ST->hasAVX())
4377 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4378 return ArithmeticCost + Entry->Cost;
4380 if (ST->hasSSE2())
4381 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4382 return ArithmeticCost + Entry->Cost;
4384 // FIXME: These assume a naive kshift+binop lowering, which is probably
4385 // conservative in most cases.
4386 static const CostTblEntry AVX512BoolReduction[] = {
4387 { ISD::AND, MVT::v2i1, 3 },
4388 { ISD::AND, MVT::v4i1, 5 },
4389 { ISD::AND, MVT::v8i1, 7 },
4390 { ISD::AND, MVT::v16i1, 9 },
4391 { ISD::AND, MVT::v32i1, 11 },
4392 { ISD::AND, MVT::v64i1, 13 },
4393 { ISD::OR, MVT::v2i1, 3 },
4394 { ISD::OR, MVT::v4i1, 5 },
4395 { ISD::OR, MVT::v8i1, 7 },
4396 { ISD::OR, MVT::v16i1, 9 },
4397 { ISD::OR, MVT::v32i1, 11 },
4398 { ISD::OR, MVT::v64i1, 13 },
4399 };
4401 static const CostTblEntry AVX2BoolReduction[] = {
4402 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
4403 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
4404 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
4405 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
4406 };
4408 static const CostTblEntry AVX1BoolReduction[] = {
4409 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
4410 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
4411 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
4412 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
4413 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
4414 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
4415 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
4416 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
4417 };
4419 static const CostTblEntry SSE2BoolReduction[] = {
4420 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
4421 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
4422 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
4423 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
4424 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
4425 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
4426 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
4427 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
4428 };
4430 // Handle bool allof/anyof patterns.
4431 if (ValVTy->getElementType()->isIntegerTy(1)) {
4432 InstructionCost ArithmeticCost = 0;
4433 if (LT.first != 1 && MTy.isVector() &&
4434 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4435 // Type needs to be split. We need LT.first - 1 arithmetic ops.
4436 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
4437 MTy.getVectorNumElements());
4438 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
4439 ArithmeticCost *= LT.first - 1;
4440 }
4442 if (ST->hasAVX512())
4443 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
4444 return ArithmeticCost + Entry->Cost;
4445 if (ST->hasAVX2())
4446 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
4447 return ArithmeticCost + Entry->Cost;
4448 if (ST->hasAVX())
4449 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
4450 return ArithmeticCost + Entry->Cost;
4451 if (ST->hasSSE2())
4452 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
4453 return ArithmeticCost + Entry->Cost;
4455 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
4456 }
4458 unsigned NumVecElts = ValVTy->getNumElements();
4459 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
4461 // Special case power of 2 reductions where the scalar type isn't changed
4462 // by type legalization.
4463 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
4464 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
4466 InstructionCost ReductionCost = 0;
4468 auto *Ty = ValVTy;
4469 if (LT.first != 1 && MTy.isVector() &&
4470 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4471 // Type needs to be split. We need LT.first - 1 arithmetic ops.
4472 Ty = FixedVectorType::get(ValVTy->getElementType(),
4473 MTy.getVectorNumElements());
4474 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4475 ReductionCost *= LT.first - 1;
4476 NumVecElts = MTy.getVectorNumElements();
4477 }
4479 // Now handle reduction with the legal type, taking into account size changes
4480 // at each level.
4481 while (NumVecElts > 1) {
4482 // Determine the size of the remaining vector we need to reduce.
4483 unsigned Size = NumVecElts * ScalarSize;
4484 NumVecElts /= 2;
4485 // If we're reducing from 256/512 bits, use an extract_subvector.
4486 if (Size > 128) {
4487 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
4488 ReductionCost +=
4489 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
4490 Ty = SubTy;
4491 } else if (Size == 128) {
4492 // Reducing from 128 bits is a permute of v2f64/v2i64.
4493 FixedVectorType *ShufTy;
4494 if (ValVTy->isFloatingPointTy())
4495 ShufTy =
4496 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
4497 else
4498 ShufTy =
4499 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
4500 ReductionCost +=
4501 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4502 } else if (Size == 64) {
4503 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
4504 FixedVectorType *ShufTy;
4505 if (ValVTy->isFloatingPointTy())
4506 ShufTy =
4507 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
4508 else
4509 ShufTy =
4510 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
4511 ReductionCost +=
4512 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4513 } else {
4514 // Reducing from smaller size is a shift by immediate.
4515 auto *ShiftTy = FixedVectorType::get(
4516 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
4517 ReductionCost += getArithmeticInstrCost(
4518 Instruction::LShr, ShiftTy, CostKind,
4519 TargetTransformInfo::OK_AnyValue,
4520 TargetTransformInfo::OK_UniformConstantValue,
4521 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
4522 }
4524 // Add the arithmetic op for this level.
4525 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
4526 }
4528 // Add the final extract element to the cost.
4529 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
4530 }
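// Worked example (illustrative): a v8i32 add reduction on AVX1 is caught by
// the pre-legalization lookup above ({ISD::ADD, MVT::v8i32, 5}). An op the
// tables don't cover, e.g. a v4f32 fmul reduction, falls through to the
// shuffle ladder: permute + fmul to fold 128 -> 64 bits, shuffle + fmul to
// fold 64 -> 32 bits, plus the final extractelement.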
4532 InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
4533 bool IsUnsigned) {
4534 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
4536 MVT MTy = LT.second;
4538 int ISD;
4539 if (Ty->isIntOrIntVectorTy()) {
4540 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
4541 } else {
4542 assert(Ty->isFPOrFPVectorTy() &&
4543 "Expected floating point or integer vector type.");
4544 ISD = ISD::FMINNUM;
4545 }
4547 static const CostTblEntry SSE1CostTbl[] = {
4548 {ISD::FMINNUM, MVT::v4f32, 1},
4549 };
4551 static const CostTblEntry SSE2CostTbl[] = {
4552 {ISD::FMINNUM, MVT::v2f64, 1},
4553 {ISD::SMIN, MVT::v8i16, 1},
4554 {ISD::UMIN, MVT::v16i8, 1},
4555 };
4557 static const CostTblEntry SSE41CostTbl[] = {
4558 {ISD::SMIN, MVT::v4i32, 1},
4559 {ISD::UMIN, MVT::v4i32, 1},
4560 {ISD::UMIN, MVT::v8i16, 1},
4561 {ISD::SMIN, MVT::v16i8, 1},
4562 };
4564 static const CostTblEntry SSE42CostTbl[] = {
4565 {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
4566 };
4568 static const CostTblEntry AVX1CostTbl[] = {
4569 {ISD::FMINNUM, MVT::v8f32, 1},
4570 {ISD::FMINNUM, MVT::v4f64, 1},
4571 {ISD::SMIN, MVT::v8i32, 3},
4572 {ISD::UMIN, MVT::v8i32, 3},
4573 {ISD::SMIN, MVT::v16i16, 3},
4574 {ISD::UMIN, MVT::v16i16, 3},
4575 {ISD::SMIN, MVT::v32i8, 3},
4576 {ISD::UMIN, MVT::v32i8, 3},
4577 };
4579 static const CostTblEntry AVX2CostTbl[] = {
4580 {ISD::SMIN, MVT::v8i32, 1},
4581 {ISD::UMIN, MVT::v8i32, 1},
4582 {ISD::SMIN, MVT::v16i16, 1},
4583 {ISD::UMIN, MVT::v16i16, 1},
4584 {ISD::SMIN, MVT::v32i8, 1},
4585 {ISD::UMIN, MVT::v32i8, 1},
4586 };
4588 static const CostTblEntry AVX512CostTbl[] = {
4589 {ISD::FMINNUM, MVT::v16f32, 1},
4590 {ISD::FMINNUM, MVT::v8f64, 1},
4591 {ISD::SMIN, MVT::v2i64, 1},
4592 {ISD::UMIN, MVT::v2i64, 1},
4593 {ISD::SMIN, MVT::v4i64, 1},
4594 {ISD::UMIN, MVT::v4i64, 1},
4595 {ISD::SMIN, MVT::v8i64, 1},
4596 {ISD::UMIN, MVT::v8i64, 1},
4597 {ISD::SMIN, MVT::v16i32, 1},
4598 {ISD::UMIN, MVT::v16i32, 1},
4599 };
4601 static const CostTblEntry AVX512BWCostTbl[] = {
4602 {ISD::SMIN, MVT::v32i16, 1},
4603 {ISD::UMIN, MVT::v32i16, 1},
4604 {ISD::SMIN, MVT::v64i8, 1},
4605 {ISD::UMIN, MVT::v64i8, 1},
4606 };
4608 // If we have a native MIN/MAX instruction for this type, use it.
4609 if (ST->hasBWI())
4610 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4611 return LT.first * Entry->Cost;
4613 if (ST->hasAVX512())
4614 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4615 return LT.first * Entry->Cost;
4617 if (ST->hasAVX2())
4618 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4619 return LT.first * Entry->Cost;
4621 if (ST->hasAVX())
4622 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4623 return LT.first * Entry->Cost;
4625 if (ST->hasSSE42())
4626 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4627 return LT.first * Entry->Cost;
4629 if (ST->hasSSE41())
4630 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4631 return LT.first * Entry->Cost;
4633 if (ST->hasSSE2())
4634 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4635 return LT.first * Entry->Cost;
4637 if (ST->hasSSE1())
4638 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4639 return LT.first * Entry->Cost;
4641 unsigned CmpOpcode;
4642 if (Ty->isFPOrFPVectorTy()) {
4643 CmpOpcode = Instruction::FCmp;
4644 } else {
4645 assert(Ty->isIntOrIntVectorTy() &&
4646 "expecting floating point or integer type for min/max reduction");
4647 CmpOpcode = Instruction::ICmp;
4648 }
4650 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4651 // Otherwise fall back to cmp+select.
4652 InstructionCost Result =
4653 getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
4654 CostKind) +
4655 getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
4656 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4657 return Result;
4658 }
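// For example (illustrative): smin on v4i32 is a single pminsd with SSE4.1
// ({ISD::SMIN, MVT::v4i32, 1} above), but on bare SSE2 there is no table
// entry, so the cost is computed as an icmp plus a select instead.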
4660 InstructionCost
4661 X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
4662 bool IsUnsigned,
4663 TTI::TargetCostKind CostKind) {
4664 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
4666 MVT MTy = LT.second;
4668 int ISD;
4669 if (ValTy->isIntOrIntVectorTy()) {
4670 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
4671 } else {
4672 assert(ValTy->isFPOrFPVectorTy() &&
4673 "Expected floating point or integer vector type.");
4674 ISD = ISD::FMINNUM;
4675 }
4677 // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
4678 // and use that as the cost.
4680 static const CostTblEntry SSE2CostTblNoPairWise[] = {
4681 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
4682 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
4683 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
4684 };
4686 static const CostTblEntry SSE41CostTblNoPairWise[] = {
4687 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
4688 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
4689 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
4690 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
4691 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
4692 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
4693 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
4694 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
4695 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
4696 {ISD::SMIN, MVT::v16i8, 6},
4697 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
4698 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
4699 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
4700 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
4701 };
4703 static const CostTblEntry AVX1CostTblNoPairWise[] = {
4704 {ISD::SMIN, MVT::v16i16, 6},
4705 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
4706 {ISD::SMIN, MVT::v32i8, 8},
4707 {ISD::UMIN, MVT::v32i8, 8},
4708 };
4710 static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
4711 {ISD::SMIN, MVT::v32i16, 8},
4712 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
4713 {ISD::SMIN, MVT::v64i8, 10},
4714 {ISD::UMIN, MVT::v64i8, 10},
4715 };
4717 // Before legalizing the type, give a chance to look up illegal narrow types
4718 // in the table.
4719 // FIXME: Is there a better way to do this?
4720 EVT VT = TLI->getValueType(DL, ValTy);
4721 if (VT.isSimple()) {
4722 MVT MTy = VT.getSimpleVT();
4723 if (ST->hasBWI())
4724 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
4725 return Entry->Cost;
4727 if (ST->hasAVX())
4728 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4729 return Entry->Cost;
4731 if (ST->hasSSE41())
4732 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
4733 return Entry->Cost;
4735 if (ST->hasSSE2())
4736 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4737 return Entry->Cost;
4738 }
4740 auto *ValVTy = cast<FixedVectorType>(ValTy);
4741 unsigned NumVecElts = ValVTy->getNumElements();
4743 auto *Ty = ValVTy;
4744 InstructionCost MinMaxCost = 0;
4745 if (LT.first != 1 && MTy.isVector() &&
4746 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
4747 // Type needs to be split. We need LT.first - 1 operations.
4748 Ty = FixedVectorType::get(ValVTy->getElementType(),
4749 MTy.getVectorNumElements());
4750 auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
4751 MTy.getVectorNumElements());
4752 MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
4753 MinMaxCost *= LT.first - 1;
4754 NumVecElts = MTy.getVectorNumElements();
4755 }
4757 if (ST->hasBWI())
4758 if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
4759 return MinMaxCost + Entry->Cost;
4761 if (ST->hasAVX())
4762 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
4763 return MinMaxCost + Entry->Cost;
4765 if (ST->hasSSE41())
4766 if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
4767 return MinMaxCost + Entry->Cost;
4769 if (ST->hasSSE2())
4770 if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
4771 return MinMaxCost + Entry->Cost;
4773 unsigned ScalarSize = ValTy->getScalarSizeInBits();
4775 // Special case power of 2 reductions where the scalar type isn't changed
4776 // by type legalization.
4777 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
4778 ScalarSize != MTy.getScalarSizeInBits())
4779 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);
4781 // Now handle reduction with the legal type, taking into account size changes
4782 // at each level.
4783 while (NumVecElts > 1) {
4784 // Determine the size of the remaining vector we need to reduce.
4785 unsigned Size = NumVecElts * ScalarSize;
4786 NumVecElts /= 2;
4787 // If we're reducing from 256/512 bits, use an extract_subvector.
4788 if (Size > 128) {
4789 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
4790 MinMaxCost +=
4791 getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
4792 Ty = SubTy;
4793 } else if (Size == 128) {
4794 // Reducing from 128 bits is a permute of v2f64/v2i64.
4795 FixedVectorType *ShufTy;
4796 if (ValTy->isFloatingPointTy())
4797 ShufTy =
4798 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
4799 else
4800 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
4801 MinMaxCost +=
4802 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4803 } else if (Size == 64) {
4804 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
4805 FixedVectorType *ShufTy;
4806 if (ValTy->isFloatingPointTy())
4807 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
4808 else
4809 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
4810 MinMaxCost +=
4811 getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
4812 } else {
4813 // Reducing from smaller size is a shift by immediate.
4814 auto *ShiftTy = FixedVectorType::get(
4815 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
4816 MinMaxCost += getArithmeticInstrCost(
4817 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
4818 TargetTransformInfo::OK_AnyValue,
4819 TargetTransformInfo::OK_UniformConstantValue,
4820 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
4821 }
4823 // Add the arithmetic op for this level.
4824 auto *SubCondTy =
4825 FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
4826 MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
4827 }
4829 // Add the final extract element to the cost.
4830 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
4831 }
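// Worked example (illustrative): a v16i16 smin reduction on AVX1 matches
// {ISD::SMIN, MVT::v16i16, 6} in the pre-legalization lookup. A type the
// tables miss instead walks the ladder above: an extract/permute plus a
// min/max (via getMinMaxCost) at each halving step, then the final
// extractelement.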
4833 /// Calculate the cost of materializing a 64-bit value. This helper
4834 /// method might only calculate a fraction of a larger immediate. Therefore it
4835 /// is valid to return a cost of ZERO.
4836 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
4837 if (Val == 0)
4838 return TTI::TCC_Free;
4840 if (isInt<32>(Val))
4841 return TTI::TCC_Basic;
4843 return 2 * TTI::TCC_Basic;
4844 }
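// For example (illustrative): 0 is free (no instruction needed), 0x7FFFFFFF
// fits a sign-extended imm32 and costs one basic instruction, while a value
// like 0x123456789A needs a movabsq-style 64-bit materialization and is
// charged 2 * TCC_Basic.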
4846 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
4847 TTI::TargetCostKind CostKind) {
4848 assert(Ty->isIntegerTy());
4850 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4851 if (BitSize == 0)
4852 return ~0U;
4854 // Never hoist constants larger than 128bit, because this might lead to
4855 // incorrect code generation or assertions in codegen.
4856 // FIXME: Create a cost model for types larger than i128 once the codegen
4857 // issues have been fixed.
4858 if (BitSize > 128)
4859 return TTI::TCC_Free;
4861 if (Imm == 0)
4862 return TTI::TCC_Free;
4864 // Sign-extend all constants to a multiple of 64-bit.
4865 APInt ImmVal = Imm;
4866 if (BitSize % 64 != 0)
4867 ImmVal = Imm.sext(alignTo(BitSize, 64));
4869 // Split the constant into 64-bit chunks and calculate the cost for each
4870 // chunk.
4871 InstructionCost Cost = 0;
4872 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
4873 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
4874 int64_t Val = Tmp.getSExtValue();
4875 Cost += getIntImmCost(Val);
4877 // We need at least one instruction to materialize the constant.
4878 return std::max<InstructionCost>(1, Cost);
4879 }
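// Illustrative example: an i128 constant whose low 64-bit chunk is 0 and
// whose high chunk fits in 32 bits sums to 0 + TCC_Basic, with the
// std::max clamp guaranteeing at least one instruction is always charged.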
4881 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
4882 const APInt &Imm, Type *Ty,
4883 TTI::TargetCostKind CostKind,
4884 Instruction *Inst) {
4885 assert(Ty->isIntegerTy());
4887 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4888 // There is no cost model for constants with a bit size of 0. Return TCC_Free
4889 // here, so that constant hoisting will ignore this constant.
4890 if (BitSize == 0)
4891 return TTI::TCC_Free;
4893 unsigned ImmIdx = ~0U;
4894 switch (Opcode) {
4895 default:
4896 return TTI::TCC_Free;
4897 case Instruction::GetElementPtr:
4898 // Always hoist the base address of a GetElementPtr. This prevents the
4899 // creation of new constants for every base constant that gets constant
4900 // folded with the offset.
4901 if (Idx == 0)
4902 return 2 * TTI::TCC_Basic;
4903 return TTI::TCC_Free;
4904 case Instruction::Store:
4905 ImmIdx = 0;
4906 break;
4907 case Instruction::ICmp:
4908 // This is an imperfect hack to prevent constant hoisting of
4909 // compares that might be trying to check if a 64-bit value fits in
4910 // 32-bits. The backend can optimize these cases using a right shift by 32.
4911 // Ideally we would check the compare predicate here. There also other
4912 // similar immediates the backend can use shifts for.
4913 if (Idx == 1 && Imm.getBitWidth() == 64) {
4914 uint64_t ImmVal = Imm.getZExtValue();
4915 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
4916 return TTI::TCC_Free;
4917 }
4918 ImmIdx = 1;
4919 break;
4920 case Instruction::And:
4921 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
4922 // by using a 32-bit operation with implicit zero extension. Detect such
4923 // immediates here as the normal path expects bit 31 to be sign extended.
4924 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
4925 return TTI::TCC_Free;
4926 ImmIdx = 1;
4927 break;
4928 case Instruction::Add:
4929 case Instruction::Sub:
4930 // For add/sub, we can use the opposite instruction for INT32_MIN.
4931 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
4932 return TTI::TCC_Free;
4933 ImmIdx = 1;
4934 break;
4935 case Instruction::UDiv:
4936 case Instruction::SDiv:
4937 case Instruction::URem:
4938 case Instruction::SRem:
4939 // Division by constant is typically expanded later into a different
4940 // instruction sequence. This completely changes the constants.
4941 // Report them as "free" to stop ConstantHoist from marking them as opaque.
4942 return TTI::TCC_Free;
4943 case Instruction::Mul:
4944 case Instruction::Or:
4945 case Instruction::Xor:
4946 ImmIdx = 1;
4947 break;
4948 // Always return TCC_Free for the shift value of a shift instruction.
4949 case Instruction::Shl:
4950 case Instruction::LShr:
4951 case Instruction::AShr:
4952 if (Idx == 1)
4953 return TTI::TCC_Free;
4954 break;
4955 case Instruction::Trunc:
4956 case Instruction::ZExt:
4957 case Instruction::SExt:
4958 case Instruction::IntToPtr:
4959 case Instruction::PtrToInt:
4960 case Instruction::BitCast:
4961 case Instruction::PHI:
4962 case Instruction::Call:
4963 case Instruction::Select:
4964 case Instruction::Ret:
4965 case Instruction::Load:
4966 break;
4967 }
4969 if (Idx == ImmIdx) {
4970 int NumConstants = divideCeil(BitSize, 64);
4971 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
4972 return (Cost <= NumConstants * TTI::TCC_Basic)
4973 ? static_cast<int>(TTI::TCC_Free)
4977 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
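// For example (illustrative): `icmp eq i64 %x, 4294967295` keeps its
// immediate free (the backend can use a shift-by-32 trick), and a 64-bit
// `and` with 0x00000000FFFFFFFF is free because it becomes a 32-bit op with
// implicit zero extension; an immediate outside these patterns falls
// through to the generic getIntImmCost above.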
4980 InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
4981 const APInt &Imm, Type *Ty,
4982 TTI::TargetCostKind CostKind) {
4983 assert(Ty->isIntegerTy());
4985 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4986 // There is no cost model for constants with a bit size of 0. Return TCC_Free
4987 // here, so that constant hoisting will ignore this constant.
4988 if (BitSize == 0)
4989 return TTI::TCC_Free;
4991 switch (IID) {
4992 default:
4993 return TTI::TCC_Free;
4994 case Intrinsic::sadd_with_overflow:
4995 case Intrinsic::uadd_with_overflow:
4996 case Intrinsic::ssub_with_overflow:
4997 case Intrinsic::usub_with_overflow:
4998 case Intrinsic::smul_with_overflow:
4999 case Intrinsic::umul_with_overflow:
5000 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
5001 return TTI::TCC_Free;
5002 break;
5003 case Intrinsic::experimental_stackmap:
5004 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
5005 return TTI::TCC_Free;
5006 break;
5007 case Intrinsic::experimental_patchpoint_void:
5008 case Intrinsic::experimental_patchpoint_i64:
5009 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
5010 return TTI::TCC_Free;
5011 break;
5012 }
5013 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
5014 }
5016 InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
5017 TTI::TargetCostKind CostKind,
5018 const Instruction *I) {
5019 if (CostKind != TTI::TCK_RecipThroughput)
5020 return Opcode == Instruction::PHI ? 0 : 1;
5021 // Branches are assumed to be predicted.
5022 return 0;
5023 }
5025 int X86TTIImpl::getGatherOverhead() const {
5026 // Some CPUs have more overhead for gather. The specified overhead is relative
5027 // to the Load operation. "2" is the number provided by Intel architects. This
5028 // parameter is used for cost estimation of Gather Op and comparison with
5029 // other alternatives.
5030 // TODO: Remove the explicit hasAVX512()? That would mean we would only
5031 // enable gather with a -march.
5032 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
5033 return 2;
5035 return 1024;
5036 }
5038 int X86TTIImpl::getScatterOverhead() const {
5039 if (ST->hasAVX512())
5040 return 2;
5042 return 1024;
5043 }
5045 // Return an average cost of Gather / Scatter instruction, maybe improved later.
5046 // FIXME: Add TargetCostKind support.
5047 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
5048 const Value *Ptr, Align Alignment,
5049 unsigned AddressSpace) {
5051 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
5052 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5054 // Try to reduce index size from 64 bit (default for GEP)
5055 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
5056 // operation will use 16 x 64 indices which do not fit in a zmm and needs
5057 // to split. Also check that the base pointer is the same for all lanes,
5058 // and that there's at most one variable index.
5059 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
5060 unsigned IndexSize = DL.getPointerSizeInBits();
5061 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5062 if (IndexSize < 64 || !GEP)
5063 return IndexSize;
5065 unsigned NumOfVarIndices = 0;
5066 const Value *Ptrs = GEP->getPointerOperand();
5067 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
5068 return IndexSize;
5069 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
5070 if (isa<Constant>(GEP->getOperand(i)))
5071 continue;
5072 Type *IndxTy = GEP->getOperand(i)->getType();
5073 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
5074 IndxTy = IndexVTy->getElementType();
5075 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
5076 !isa<SExtInst>(GEP->getOperand(i))) ||
5077 ++NumOfVarIndices > 1)
5078 return IndexSize; // 64
5079 }
5080 return (unsigned)32;
5081 };
5083 // Trying to reduce IndexSize to 32 bits for vector 16.
5084 // By default the IndexSize is equal to pointer size.
5085 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
5086 ? getIndexSizeInBits(Ptr, DL)
5087 : DL.getPointerSizeInBits();
5089 auto *IndexVTy = FixedVectorType::get(
5090 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
5091 std::pair<InstructionCost, MVT> IdxsLT =
5092 TLI->getTypeLegalizationCost(DL, IndexVTy);
5093 std::pair<InstructionCost, MVT> SrcLT =
5094 TLI->getTypeLegalizationCost(DL, SrcVTy);
5095 InstructionCost::CostType SplitFactor =
5096 *std::max(IdxsLT.first, SrcLT.first).getValue();
5097 if (SplitFactor > 1) {
5098 // Handle splitting of vector of pointers
5099 auto *SplitSrcTy =
5100 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
5101 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
5102 AddressSpace);
5103 }
5105 // The gather / scatter cost is given by Intel architects. It is a rough
5106 // number since we are looking at one instruction at a time.
5107 const int GSOverhead = (Opcode == Instruction::Load)
5108 ? getGatherOverhead()
5109 : getScatterOverhead();
5110 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5111 MaybeAlign(Alignment), AddressSpace,
5112 TTI::TCK_RecipThroughput);
5113 }
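// Illustrative example: a VF=16 gather of i32 on AVX-512 tries to shrink the
// GEP index type from i64 to i32 so all indices fit one zmm register; if a
// lane genuinely needs a 64-bit index, the index/source legalization above
// yields a split factor > 1 and the cost is multiplied accordingly.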
5115 /// Return the cost of full scalarization of gather / scatter operation.
5117 /// Opcode - Load or Store instruction.
5118 /// SrcVTy - The type of the data vector that should be gathered or scattered.
5119 /// VariableMask - The mask is non-constant at compile time.
5120 /// Alignment - Alignment for one element.
5121 /// AddressSpace - pointer[s] address space.
5123 /// FIXME: Add TargetCostKind support.
5124 InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
5125 bool VariableMask, Align Alignment,
5126 unsigned AddressSpace) {
5127 Type *ScalarTy = SrcVTy->getScalarType();
5128 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
5129 APInt DemandedElts = APInt::getAllOnes(VF);
5130 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5132 InstructionCost MaskUnpackCost = 0;
5133 if (VariableMask) {
5134 auto *MaskTy =
5135 FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
5136 MaskUnpackCost = getScalarizationOverhead(
5137 MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true);
5138 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5139 Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5140 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5141 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5142 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5143 }
5145 InstructionCost AddressUnpackCost = getScalarizationOverhead(
5146 FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
5147 /*Insert=*/false, /*Extract=*/true);
5149 // The cost of the scalar loads/stores.
5150 InstructionCost MemoryOpCost =
5151 VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
5152 AddressSpace, CostKind);
5154 // The cost of forming the vector from loaded scalars/
5155 // scalarizing the vector to perform scalar stores.
5156 InstructionCost InsertExtractCost =
5157 getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts,
5158 /*Insert=*/Opcode == Instruction::Load,
5159 /*Extract=*/Opcode == Instruction::Store);
5161 return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
5162 }
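// Rough breakdown (illustrative) for a scalarized VF=4 masked gather:
// 4 mask-bit extracts plus 4 compare/branch pairs (only when the mask is
// variable), 4 pointer extracts, 4 scalar loads, and 4 element inserts to
// rebuild the result vector.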
5164 /// Calculate the cost of Gather / Scatter operation
5165 InstructionCost X86TTIImpl::getGatherScatterOpCost(
5166 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
5167 Align Alignment, TTI::TargetCostKind CostKind,
5168 const Instruction *I = nullptr) {
5169 if (CostKind != TTI::TCK_RecipThroughput) {
5170 if ((Opcode == Instruction::Load &&
5171 isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
5172 !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5173 Align(Alignment))) ||
5174 (Opcode == Instruction::Store &&
5175 isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
5176 !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5177 Align(Alignment))))
5178 return 1;
5179 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
5180 Alignment, CostKind, I);
5181 }
5183 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
5184 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5185 if (!PtrTy && Ptr->getType()->isVectorTy())
5186 PtrTy = dyn_cast<PointerType>(
5187 cast<VectorType>(Ptr->getType())->getElementType());
5188 assert(PtrTy && "Unexpected type for Ptr argument");
5189 unsigned AddressSpace = PtrTy->getAddressSpace();
5191 if ((Opcode == Instruction::Load &&
5192 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
5193 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
5194 Align(Alignment)))) ||
5195 (Opcode == Instruction::Store &&
5196 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
5197 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
5198 Align(Alignment)))))
5199 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
5200 AddressSpace);
5202 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
5203 }
5205 bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5206 const TargetTransformInfo::LSRCost &C2) {
5207 // X86 specific here are "instruction number 1st priority".
5208 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
5209 C1.NumIVMuls, C1.NumBaseAdds,
5210 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5211 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
5212 C2.NumIVMuls, C2.NumBaseAdds,
5213 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5214 }
5216 bool X86TTIImpl::canMacroFuseCmp() {
5217 return ST->hasMacroFusion() || ST->hasBranchFusion();
5218 }
5220 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
5221 if (!ST->hasAVX())
5222 return false;
5224 // The backend can't handle a single element vector.
5225 if (isa<VectorType>(DataTy) &&
5226 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5227 return false;
5228 Type *ScalarTy = DataTy->getScalarType();
5230 if (ScalarTy->isPointerTy())
5231 return true;
5233 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5234 return true;
5236 if (ScalarTy->isHalfTy() && ST->hasBWI())
5237 return true;
5239 if (!ScalarTy->isIntegerTy())
5240 return false;
5242 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5243 return IntWidth == 32 || IntWidth == 64 ||
5244 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
5245 }
5247 bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
5248 return isLegalMaskedLoad(DataType, Alignment);
5249 }
5251 bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
5252 unsigned DataSize = DL.getTypeStoreSize(DataType);
5253 // The only supported nontemporal loads are for aligned vectors of 16 or 32
5254 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
5255 // (the equivalent stores only require AVX).
5256 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
5257 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
5259 return false;
5260 }
5262 bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
5263 unsigned DataSize = DL.getTypeStoreSize(DataType);
5265 // SSE4A supports nontemporal stores of float and double at arbitrary
5266 // alignment.
5267 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
5268 return true;
5270 // Besides the SSE4A subtarget exception above, only aligned stores are
5271 // available nontemporally on any other subtarget. And only stores with a size
5272 // of 4..32 bytes (powers of 2 only) are permitted.
5273 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
5274 !isPowerOf2_32(DataSize))
5275 return false;
5277 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
5278 // loads require AVX2).
5279 if (DataSize == 32)
5280 return ST->hasAVX();
5281 if (DataSize == 16)
5282 return ST->hasSSE1();
5283 return true;
5284 }
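// Quick reference (illustrative): a 32-byte aligned <8 x float> NT store is
// legal with AVX (vmovntps) while the matching NT load needs AVX2
// (vmovntdqa); an aligned 4- or 8-byte scalar NT store is accepted
// unconditionally here; unaligned NT accesses are rejected except for
// SSE4A's movntss/movntsd.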
5286 bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
5287 ElementCount NumElements) const {
5288 // movddup
5289 return ST->hasSSE3() && !NumElements.isScalable() &&
5290 NumElements.getFixedValue() == 2 &&
5291 ElementTy == Type::getDoubleTy(ElementTy->getContext());
5292 }
5294 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
5295 if (!isa<VectorType>(DataTy))
5296 return false;
5298 if (!ST->hasAVX512())
5299 return false;
5301 // The backend can't handle a single element vector.
5302 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
5303 return false;
5305 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
5307 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5308 return true;
5310 if (!ScalarTy->isIntegerTy())
5311 return false;
5313 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5314 return IntWidth == 32 || IntWidth == 64 ||
5315 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
5316 }
5318 bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
5319 return isLegalMaskedExpandLoad(DataTy);
5320 }
5322 bool X86TTIImpl::supportsGather() const {
5323 // Some CPUs have better gather performance than others.
5324 // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
5325 // enable gather with a -march.
5326 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
5327 }
5329 bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
5330 // Gather / Scatter for vector 2 is not profitable on KNL / SKX
5331 // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
5332 // it to 8 elements, but zeroing upper bits of the mask vector will add more
5333 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
5334 // Check, maybe the gather/scatter instruction is better in the VariableMask
5335 // case.
5336 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
5337 return NumElts == 1 ||
5338 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
5339 }
5341 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
5342 if (!supportsGather())
5343 return false;
5344 Type *ScalarTy = DataTy->getScalarType();
5345 if (ScalarTy->isPointerTy())
5346 return true;
5348 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
5349 return true;
5351 if (!ScalarTy->isIntegerTy())
5352 return false;
5354 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
5355 return IntWidth == 32 || IntWidth == 64;
5356 }
5358 bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
5359 unsigned Opcode1,
5360 const SmallBitVector &OpcodeMask) const {
5361 // ADDSUBPS 4xf32 SSE3
5362 // VADDSUBPS 4xf32 AVX
5363 // VADDSUBPS 8xf32 AVX2
5364 // ADDSUBPD 2xf64 SSE3
5365 // VADDSUBPD 2xf64 AVX
5366 // VADDSUBPD 4xf64 AVX2
5368 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
5369 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
5370 if (!isPowerOf2_32(NumElements))
5371 return false;
5372 // Check the opcode pattern. We apply the mask on the opcode arguments and
5373 // then check if it is what we expect.
5374 for (int Lane : seq<int>(0, NumElements)) {
5375 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
5376 // We expect FSub for even lanes and FAdd for odd lanes.
5377 if (Lane % 2 == 0 && Opc != Instruction::FSub)
5378 return false;
5379 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
5380 return false;
5381 }
5382 // Now check that the pattern is supported by the target ISA.
5383 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
5384 if (ElemTy->isFloatTy())
5385 return ST->hasSSE3() && NumElements % 4 == 0;
5386 if (ElemTy->isDoubleTy())
5387 return ST->hasSSE3() && NumElements % 2 == 0;
5389 return false;
5390 }
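// For example (illustrative): a <4 x float> node alternating FSub in even
// lanes (0, 2) and FAdd in odd lanes (1, 3) maps to a single addsubps on
// SSE3 and is reported legal; any other opcode interleaving is rejected.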
5391 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
5392 // AVX2 doesn't support scatter
5393 if (!ST->hasAVX512())
5394 return false;
5395 return isLegalMaskedGather(DataType, Alignment);
5396 }
5398 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
5399 EVT VT = TLI->getValueType(DL, DataType);
5400 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
5401 }
5403 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
5404 return true;
5405 }
5407 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
5408 const Function *Callee) const {
5409 const TargetMachine &TM = getTLI()->getTargetMachine();
5411 // Work this as a subsetting of subtarget features.
5412 const FeatureBitset &CallerBits =
5413 TM.getSubtargetImpl(*Caller)->getFeatureBits();
5414 const FeatureBitset &CalleeBits =
5415 TM.getSubtargetImpl(*Callee)->getFeatureBits();
5417 // Check whether features are the same (apart from the ignore list).
5418 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
5419 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
5420 if (RealCallerBits == RealCalleeBits)
5421 return true;
5423 // If the features are a subset, we need to additionally check for calls
5424 // that may become ABI-incompatible as a result of inlining.
5425 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
5426 return false;
5428 for (const Instruction &I : instructions(Callee)) {
5429 if (const auto *CB = dyn_cast<CallBase>(&I)) {
5430 SmallVector<Type *, 8> Types;
5431 for (Value *Arg : CB->args())
5432 Types.push_back(Arg->getType());
5433 if (!CB->getType()->isVoidTy())
5434 Types.push_back(CB->getType());
5436 // Simple types are always ABI compatible.
5437 auto IsSimpleTy = [](Type *Ty) {
5438 return !Ty->isVectorTy() && !Ty->isAggregateType();
5439 };
5440 if (all_of(Types, IsSimpleTy))
5441 continue;
5443 if (Function *NestedCallee = CB->getCalledFunction()) {
5444 // Assume that intrinsics are always ABI compatible.
5445 if (NestedCallee->isIntrinsic())
5446 continue;
5448 // Do a precise compatibility check.
5449 if (!areTypesABICompatible(Caller, NestedCallee, Types))
5450 return false;
5451 } else {
5452 // We don't know the target features of the callee,
5453 // assume it is incompatible.
5454 return false;
5455 }
5456 }
5457 }
5459 return true;
5460 }
5461 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
5462 const Function *Callee,
5463 const ArrayRef<Type *> &Types) const {
5464 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
5465 return false;
5467 // If we get here, we know the target features match. If one function
5468 // considers 512-bit vectors legal and the other does not, consider them
5469 // incompatible.
5470 const TargetMachine &TM = getTLI()->getTargetMachine();
5472 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
5473 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
5474 return true;
5476 // Consider the arguments compatible if they aren't vectors or aggregates.
5477 // FIXME: Look at the size of vectors.
5478 // FIXME: Look at the element types of aggregates to see if there are vectors.
5479 return llvm::none_of(Types,
5480 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
5481 }
5483 X86TTIImpl::TTI::MemCmpExpansionOptions
5484 X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
5485 TTI::MemCmpExpansionOptions Options;
5486 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
5487 Options.NumLoadsPerBlock = 2;
5488 // All GPR and vector loads can be unaligned.
5489 Options.AllowOverlappingLoads = true;
5491 // Only enable vector loads for equality comparison. Right now the vector
5492 // version is not as fast for three way compare (see #33329).
5493 const unsigned PreferredWidth = ST->getPreferVectorWidth();
5494 if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
5495 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
5496 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
5498 if (ST->is64Bit()) {
5499 Options.LoadSizes.push_back(8);
5500 }
5501 Options.LoadSizes.push_back(4);
5502 Options.LoadSizes.push_back(2);
5503 Options.LoadSizes.push_back(1);
5504 return Options;
5505 }
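// Usage note (illustrative): with AVX2 and a preferred vector width >= 256,
// an equality memcmp of 31 bytes can expand inline to two overlapping
// 16-byte vector loads per buffer (AllowOverlappingLoads) instead of a
// libcall; three-way compares deliberately skip the vector path.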
5507 bool X86TTIImpl::prefersVectorizedAddressing() const {
5508 return supportsGather();
5511 bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
5512 return false;
5513 }
5515 bool X86TTIImpl::enableInterleavedAccessVectorization() {
5516 // TODO: We expect this to be beneficial regardless of arch,
5517 // but there are currently some unexplained performance artifacts on Atom.
5518 // As a temporary solution, disable on Atom.
5519 return !(ST->isAtom());
5520 }
5522 // Get estimation for interleaved load/store operations and strided load.
5523 // \p Indices contains indices for strided load.
5524 // \p Factor - the factor of interleaving.
5525 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
5526 InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
5527 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
5528 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
5529 TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
5530 // VecTy for interleave memop is <VF*Factor x Elt>.
5531 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
5532 // VecTy = <12 x i32>.
5534 // Calculate the number of memory operations (NumOfMemOps), required
5535 // for load/store the VecTy.
5536 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
5537 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
5538 unsigned LegalVTSize = LegalVT.getStoreSize();
5539 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
5541 // Get the cost of one memory operation.
5542 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
5543 LegalVT.getVectorNumElements());
5544 InstructionCost MemOpCost;
5545 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
5546 if (UseMaskedMemOp)
5547 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
5548 AddressSpace, CostKind);
5549 else
5550 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
5551 AddressSpace, CostKind);
5553 unsigned VF = VecTy->getNumElements() / Factor;
5554 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
5556 InstructionCost MaskCost;
5557 if (UseMaskedMemOp) {
5558 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
5559 for (unsigned Index : Indices) {
5560 assert(Index < Factor && "Invalid index for interleaved memory op");
5561 for (unsigned Elm = 0; Elm < VF; Elm++)
5562 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
5563 }
5565 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
5567 MaskCost = getReplicationShuffleCost(
5568 I1Type, Factor, VF,
5569 UseMaskForGaps ? DemandedLoadStoreElts
5570 : APInt::getAllOnes(VecTy->getNumElements()),
5571 CostKind);
5573 // The Gaps mask is invariant and created outside the loop, therefore the
5574 // cost of creating it is not accounted for here. However if we have both
5575 // a MaskForGaps and some other mask that guards the execution of the
5576 // memory access, we need to account for the cost of And-ing the two masks
5577 // inside the loop.
5578 if (UseMaskForGaps) {
5579 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
5580 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
5581 }
5582 }
5584 if (Opcode == Instruction::Load) {
5585 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
5586 // contain the cost of the optimized shuffle sequence that the
5587 // X86InterleavedAccess pass will generate.
5588 // The cost of loads and stores are computed separately from the table.
5590 // X86InterleavedAccess support only the following interleaved-access group.
5591 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
5592 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
5593 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
5594 {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
5595 };
5597 if (const auto *Entry =
5598 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
5599 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
5600 // If an entry does not exist, fall back to the default implementation.
5602 // Kind of shuffle depends on number of loaded values.
5603 // If we load the entire data in one register, we can use a 1-src shuffle.
5604 // Otherwise, we'll merge 2 sources in each operation.
5605 TTI::ShuffleKind ShuffleKind =
5606 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
5608 InstructionCost ShuffleCost =
5609 getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);
5611 unsigned NumOfLoadsInInterleaveGrp =
5612 Indices.size() ? Indices.size() : Factor;
5613 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
5614 VecTy->getNumElements() / Factor);
5615 InstructionCost NumOfResults =
5616 getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
5617 NumOfLoadsInInterleaveGrp;
5619 // About a half of the loads may be folded in shuffles when we have only
5620 // one result. If we have more than one result, or the loads are masked,
5621 // we do not fold loads at all.
5622 unsigned NumOfUnfoldedLoads =
5623 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
5625 // Get a number of shuffle operations per result.
5626 unsigned NumOfShufflesPerResult =
5627 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
5629 // The SK_MergeTwoSrc shuffle clobbers one of src operands.
5630 // When we have more than one destination, we need additional instructions
5631 // to keep sources.
5632 InstructionCost NumOfMoves = 0;
5633 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
5634 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
5636 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
5637 MaskCost + NumOfUnfoldedLoads * MemOpCost +
5638 NumOfMoves;
5640 return Cost;
5641 }
5643 // Store.
5644 assert(Opcode == Instruction::Store &&
5645 "Expected Store Instruction at this point");
5646 // X86InterleavedAccess support only the following interleaved-access group.
5647 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
5648 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
5649 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
5650 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
5652 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
5653 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
5654 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
5655 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
5656 };
5658 if (const auto *Entry =
5659 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
5660 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
5661 // If an entry does not exist, fall back to the default implementation.
5663 // There are no strided stores at the moment. And a store can't be folded
5664 // into any shuffle.
5665 unsigned NumOfSources = Factor; // The number of values to be merged.
5666 InstructionCost ShuffleCost =
5667 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
5668 unsigned NumOfShufflesPerStore = NumOfSources - 1;
5670 // The SK_MergeTwoSrc shuffle clobbers one of src operands.
5671 // We need additional instructions to keep sources.
5672 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
5673 InstructionCost Cost =
5674 MaskCost +
5675 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
5676 NumOfMoves;
5678 return Cost;
5679 }
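// Worked example (illustrative): storing a stride-3 interleave group of
// 3 x v16i8 on AVX-512 hits {3, MVT::v16i8, 12} above — 12 shuffle ops to
// interleave the three sources into 48 bytes — plus the cost of the
// underlying memory ops (and the mask cost, if the access is masked).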
5680 InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
5681 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
5682 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5683 bool UseMaskForCond, bool UseMaskForGaps) {
5684 auto *VecTy = cast<FixedVectorType>(BaseTy);
5686 auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
5687 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
5688 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
5689 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
5690 return true;
5691 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
5692 return HasBW;
5693 return false;
5694 };
5695 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
5696 return getInterleavedMemoryOpCostAVX512(
5697 Opcode, VecTy, Factor, Indices, Alignment,
5698 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
5700 if (UseMaskForCond || UseMaskForGaps)
5701 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5702 Alignment, AddressSpace, CostKind,
5703 UseMaskForCond, UseMaskForGaps);
5705 // Get estimation for interleaved load/store operations for SSE-AVX2.
5706 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
5707 // computing the cost using a generic formula as a function of generic
5708 // shuffles. We therefore use a lookup table instead, filled according to
5709 // the instruction sequences that codegen currently generates.
5711 // VecTy for interleave memop is <VF*Factor x Elt>.
5712 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
5713 // VecTy = <12 x i32>.
5714 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
5716 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
5717 // the VF=2, while v2i128 is an unsupported MVT vector type
5718 // (see MachineValueType.h::getVectorVT()).
5719 if (!LegalVT.isVector())
5720 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5721 Alignment, AddressSpace, CostKind);
5723 unsigned VF = VecTy->getNumElements() / Factor;
5724 Type *ScalarTy = VecTy->getElementType();
5725 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
5726 if (!ScalarTy->isIntegerTy())
5727 ScalarTy =
5728 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
5730 // Get the cost of all the memory operations.
5731 // FIXME: discount dead loads.
5732 InstructionCost MemOpCosts = getMemoryOpCost(
5733 Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
5735 auto *VT = FixedVectorType::get(ScalarTy, VF);
5736 EVT ETy = TLI->getValueType(DL, VT);
5737 if (!ETy.isSimple())
5738 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5739 Alignment, AddressSpace, CostKind);
5741 // TODO: Complete for other data-types and strides.
5742 // Each combination of Stride, element bit width and VF results in a different
5743 // sequence; The cost tables are therefore accessed with:
5744 // Factor (stride) and VectorType=VFxiN.
5745 // The Cost accounts only for the shuffle sequence;
5746 // The cost of the loads/stores is accounted for separately.
  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
      {2, MVT::v2i8, 2},     // (load 4i8 and) deinterleave into 2 x 2i8
      {2, MVT::v4i8, 2},     // (load 8i8 and) deinterleave into 2 x 4i8
      {2, MVT::v8i8, 2},     // (load 16i8 and) deinterleave into 2 x 8i8
      {2, MVT::v16i8, 4},    // (load 32i8 and) deinterleave into 2 x 16i8
      {2, MVT::v32i8, 6},    // (load 64i8 and) deinterleave into 2 x 32i8

      {2, MVT::v8i16, 6},    // (load 16i16 and) deinterleave into 2 x 8i16
      {2, MVT::v16i16, 9},   // (load 32i16 and) deinterleave into 2 x 16i16
      {2, MVT::v32i16, 18},  // (load 64i16 and) deinterleave into 2 x 32i16

      {2, MVT::v8i32, 4},    // (load 16i32 and) deinterleave into 2 x 8i32
      {2, MVT::v16i32, 8},   // (load 32i32 and) deinterleave into 2 x 16i32
      {2, MVT::v32i32, 16},  // (load 64i32 and) deinterleave into 2 x 32i32

      {2, MVT::v4i64, 4},    // (load 8i64 and) deinterleave into 2 x 4i64
      {2, MVT::v8i64, 8},    // (load 16i64 and) deinterleave into 2 x 8i64
      {2, MVT::v16i64, 16},  // (load 32i64 and) deinterleave into 2 x 16i64
      {2, MVT::v32i64, 32},  // (load 64i64 and) deinterleave into 2 x 32i64

      {3, MVT::v2i8, 3},     // (load 6i8 and) deinterleave into 3 x 2i8
      {3, MVT::v4i8, 3},     // (load 12i8 and) deinterleave into 3 x 4i8
      {3, MVT::v8i8, 6},     // (load 24i8 and) deinterleave into 3 x 8i8
      {3, MVT::v16i8, 11},   // (load 48i8 and) deinterleave into 3 x 16i8
      {3, MVT::v32i8, 14},   // (load 96i8 and) deinterleave into 3 x 32i8

      {3, MVT::v2i16, 5},    // (load 6i16 and) deinterleave into 3 x 2i16
      {3, MVT::v4i16, 7},    // (load 12i16 and) deinterleave into 3 x 4i16
      {3, MVT::v8i16, 9},    // (load 24i16 and) deinterleave into 3 x 8i16
      {3, MVT::v16i16, 28},  // (load 48i16 and) deinterleave into 3 x 16i16
      {3, MVT::v32i16, 56},  // (load 96i16 and) deinterleave into 3 x 32i16

      {3, MVT::v2i32, 3},    // (load 6i32 and) deinterleave into 3 x 2i32
      {3, MVT::v4i32, 3},    // (load 12i32 and) deinterleave into 3 x 4i32
      {3, MVT::v8i32, 7},    // (load 24i32 and) deinterleave into 3 x 8i32
      {3, MVT::v16i32, 14},  // (load 48i32 and) deinterleave into 3 x 16i32
      {3, MVT::v32i32, 32},  // (load 96i32 and) deinterleave into 3 x 32i32

      {3, MVT::v2i64, 1},    // (load 6i64 and) deinterleave into 3 x 2i64
      {3, MVT::v4i64, 5},    // (load 12i64 and) deinterleave into 3 x 4i64
      {3, MVT::v8i64, 10},   // (load 24i64 and) deinterleave into 3 x 8i64
      {3, MVT::v16i64, 20},  // (load 48i64 and) deinterleave into 3 x 16i64

      {4, MVT::v2i8, 4},     // (load 8i8 and) deinterleave into 4 x 2i8
      {4, MVT::v4i8, 4},     // (load 16i8 and) deinterleave into 4 x 4i8
      {4, MVT::v8i8, 12},    // (load 32i8 and) deinterleave into 4 x 8i8
      {4, MVT::v16i8, 24},   // (load 64i8 and) deinterleave into 4 x 16i8
      {4, MVT::v32i8, 56},   // (load 128i8 and) deinterleave into 4 x 32i8

      {4, MVT::v2i16, 6},    // (load 8i16 and) deinterleave into 4 x 2i16
      {4, MVT::v4i16, 17},   // (load 16i16 and) deinterleave into 4 x 4i16
      {4, MVT::v8i16, 33},   // (load 32i16 and) deinterleave into 4 x 8i16
      {4, MVT::v16i16, 75},  // (load 64i16 and) deinterleave into 4 x 16i16
      {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16

      {4, MVT::v2i32, 4},    // (load 8i32 and) deinterleave into 4 x 2i32
      {4, MVT::v4i32, 8},    // (load 16i32 and) deinterleave into 4 x 4i32
      {4, MVT::v8i32, 16},   // (load 32i32 and) deinterleave into 4 x 8i32
      {4, MVT::v16i32, 32},  // (load 64i32 and) deinterleave into 4 x 16i32
      {4, MVT::v32i32, 68},  // (load 128i32 and) deinterleave into 4 x 32i32

      {4, MVT::v2i64, 6},    // (load 8i64 and) deinterleave into 4 x 2i64
      {4, MVT::v4i64, 8},    // (load 16i64 and) deinterleave into 4 x 4i64
      {4, MVT::v8i64, 20},   // (load 32i64 and) deinterleave into 4 x 8i64
      {4, MVT::v16i64, 40},  // (load 64i64 and) deinterleave into 4 x 16i64

      {6, MVT::v2i8, 6},     // (load 12i8 and) deinterleave into 6 x 2i8
      {6, MVT::v4i8, 14},    // (load 24i8 and) deinterleave into 6 x 4i8
      {6, MVT::v8i8, 18},    // (load 48i8 and) deinterleave into 6 x 8i8
      {6, MVT::v16i8, 43},   // (load 96i8 and) deinterleave into 6 x 16i8
      {6, MVT::v32i8, 82},   // (load 192i8 and) deinterleave into 6 x 32i8

      {6, MVT::v2i16, 13},   // (load 12i16 and) deinterleave into 6 x 2i16
      {6, MVT::v4i16, 9},    // (load 24i16 and) deinterleave into 6 x 4i16
      {6, MVT::v8i16, 39},   // (load 48i16 and) deinterleave into 6 x 8i16
      {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
      {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16

      {6, MVT::v2i32, 6},    // (load 12i32 and) deinterleave into 6 x 2i32
      {6, MVT::v4i32, 15},   // (load 24i32 and) deinterleave into 6 x 4i32
      {6, MVT::v8i32, 31},   // (load 48i32 and) deinterleave into 6 x 8i32
      {6, MVT::v16i32, 64},  // (load 96i32 and) deinterleave into 6 x 16i32

      {6, MVT::v2i64, 6},    // (load 12i64 and) deinterleave into 6 x 2i64
      {6, MVT::v4i64, 18},   // (load 24i64 and) deinterleave into 6 x 4i64
      {6, MVT::v8i64, 36},   // (load 48i64 and) deinterleave into 6 x 8i64

      {8, MVT::v8i32, 40}    // (load 64i32 and) deinterleave into 8 x 8i32
  };
  static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
      {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
  };
  static const CostTblEntry SSE2InterleavedLoadTbl[] = {
      {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
      {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16

      {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
      {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32

      {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
  };
  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
      {2, MVT::v16i8, 3},   // interleave 2 x 16i8 into 32i8 (and store)
      {2, MVT::v32i8, 4},   // interleave 2 x 32i8 into 64i8 (and store)

      {2, MVT::v8i16, 3},   // interleave 2 x 8i16 into 16i16 (and store)
      {2, MVT::v16i16, 4},  // interleave 2 x 16i16 into 32i16 (and store)
      {2, MVT::v32i16, 8},  // interleave 2 x 32i16 into 64i16 (and store)

      {2, MVT::v4i32, 2},   // interleave 2 x 4i32 into 8i32 (and store)
      {2, MVT::v8i32, 4},   // interleave 2 x 8i32 into 16i32 (and store)
      {2, MVT::v16i32, 8},  // interleave 2 x 16i32 into 32i32 (and store)
      {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)

      {2, MVT::v2i64, 2},   // interleave 2 x 2i64 into 4i64 (and store)
      {2, MVT::v4i64, 4},   // interleave 2 x 4i64 into 8i64 (and store)
      {2, MVT::v8i64, 8},   // interleave 2 x 8i64 into 16i64 (and store)
      {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
      {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)

      {3, MVT::v2i8, 4},    // interleave 3 x 2i8 into 6i8 (and store)
      {3, MVT::v4i8, 4},    // interleave 3 x 4i8 into 12i8 (and store)
      {3, MVT::v8i8, 6},    // interleave 3 x 8i8 into 24i8 (and store)
      {3, MVT::v16i8, 11},  // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 13},  // interleave 3 x 32i8 into 96i8 (and store)

      {3, MVT::v2i16, 4},   // interleave 3 x 2i16 into 6i16 (and store)
      {3, MVT::v4i16, 6},   // interleave 3 x 4i16 into 12i16 (and store)
      {3, MVT::v8i16, 12},  // interleave 3 x 8i16 into 24i16 (and store)
      {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
      {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)

      {3, MVT::v2i32, 4},   // interleave 3 x 2i32 into 6i32 (and store)
      {3, MVT::v4i32, 5},   // interleave 3 x 4i32 into 12i32 (and store)
      {3, MVT::v8i32, 11},  // interleave 3 x 8i32 into 24i32 (and store)
      {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
      {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)

      {3, MVT::v2i64, 4},   // interleave 3 x 2i64 into 6i64 (and store)
      {3, MVT::v4i64, 6},   // interleave 3 x 4i64 into 12i64 (and store)
      {3, MVT::v8i64, 12},  // interleave 3 x 8i64 into 24i64 (and store)
      {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)

      {4, MVT::v2i8, 4},    // interleave 4 x 2i8 into 8i8 (and store)
      {4, MVT::v4i8, 4},    // interleave 4 x 4i8 into 16i8 (and store)
      {4, MVT::v8i8, 4},    // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 8},   // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 12},  // interleave 4 x 32i8 into 128i8 (and store)

      {4, MVT::v2i16, 2},   // interleave 4 x 2i16 into 8i16 (and store)
      {4, MVT::v4i16, 6},   // interleave 4 x 4i16 into 16i16 (and store)
      {4, MVT::v8i16, 10},  // interleave 4 x 8i16 into 32i16 (and store)
      {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
      {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)

      {4, MVT::v2i32, 5},   // interleave 4 x 2i32 into 8i32 (and store)
      {4, MVT::v4i32, 6},   // interleave 4 x 4i32 into 16i32 (and store)
      {4, MVT::v8i32, 16},  // interleave 4 x 8i32 into 32i32 (and store)
      {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
      {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)

      {4, MVT::v2i64, 6},   // interleave 4 x 2i64 into 8i64 (and store)
      {4, MVT::v4i64, 8},   // interleave 4 x 4i64 into 16i64 (and store)
      {4, MVT::v8i64, 20},  // interleave 4 x 8i64 into 32i64 (and store)
      {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)

      {6, MVT::v2i8, 7},    // interleave 6 x 2i8 into 12i8 (and store)
      {6, MVT::v4i8, 9},    // interleave 6 x 4i8 into 24i8 (and store)
      {6, MVT::v8i8, 16},   // interleave 6 x 8i8 into 48i8 (and store)
      {6, MVT::v16i8, 27},  // interleave 6 x 16i8 into 96i8 (and store)
      {6, MVT::v32i8, 90},  // interleave 6 x 32i8 into 192i8 (and store)

      {6, MVT::v2i16, 10},  // interleave 6 x 2i16 into 12i16 (and store)
      {6, MVT::v4i16, 15},  // interleave 6 x 4i16 into 24i16 (and store)
      {6, MVT::v8i16, 21},  // interleave 6 x 8i16 into 48i16 (and store)
      {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
      {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)

      {6, MVT::v2i32, 9},   // interleave 6 x 2i32 into 12i32 (and store)
      {6, MVT::v4i32, 12},  // interleave 6 x 4i32 into 24i32 (and store)
      {6, MVT::v8i32, 33},  // interleave 6 x 8i32 into 48i32 (and store)
      {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)

      {6, MVT::v2i64, 8},   // interleave 6 x 2i64 into 12i64 (and store)
      {6, MVT::v4i64, 15},  // interleave 6 x 4i64 into 24i64 (and store)
      {6, MVT::v8i64, 30},  // interleave 6 x 8i64 into 48i64 (and store)
  };
  static const CostTblEntry SSE2InterleavedStoreTbl[] = {
      {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8 (and store)
      {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8 (and store)
      {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8 (and store)

      {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
      {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)

      {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
  };
  if (Opcode == Instruction::Load) {
    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
                              MemOpCosts](const CostTblEntry *Entry) {
      // NOTE: this is just an approximation!
      // It can over/under-estimate the cost!
      return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
    };
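    // Worked example of the discount: for a Factor = 3 group where only 2 of
    // the 3 members are used (NumMembers = 2) and the table entry costs 7
    // shuffles, this charges MemOpCosts + divideCeil(2 * 7, 3) =
    // MemOpCosts + 5 rather than the full MemOpCosts + 7.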
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
                                              ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point");
    assert((!Indices.size() || Indices.size() == Factor) &&
           "Interleaved store only supports fully-interleaved groups.");
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
                                              ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;
  }
  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}