1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "AArch64TargetTransformInfo.h"
10 #include "AArch64ExpandImm.h"
11 #include "MCTargetDesc/AArch64AddressingModes.h"
12 #include "llvm/Analysis/IVDescriptors.h"
13 #include "llvm/Analysis/LoopInfo.h"
14 #include "llvm/Analysis/TargetTransformInfo.h"
15 #include "llvm/CodeGen/BasicTTIImpl.h"
16 #include "llvm/CodeGen/CostTable.h"
17 #include "llvm/CodeGen/TargetLowering.h"
18 #include "llvm/IR/Intrinsics.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/IntrinsicsAArch64.h"
21 #include "llvm/IR/PatternMatch.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Transforms/InstCombine/InstCombiner.h"
26 using namespace llvm::PatternMatch;
28 #define DEBUG_TYPE "aarch64tti"
30 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
31 cl::init(true), cl::Hidden);
33 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
36 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
37 cl::init(10), cl::Hidden);
39 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
40 const Function *Callee) const {
41 const TargetMachine &TM = getTLI()->getTargetMachine();
43 const FeatureBitset &CallerBits =
44 TM.getSubtargetImpl(*Caller)->getFeatureBits();
45 const FeatureBitset &CalleeBits =
46 TM.getSubtargetImpl(*Callee)->getFeatureBits();
48 // Inline a callee if its target-features are a subset of the caller's target-features.
50 return (CallerBits & CalleeBits) == CalleeBits;
53 /// Calculate the cost of materializing a 64-bit value. This helper
54 /// method might only calculate a fraction of a larger immediate. Therefore it
55 /// is valid to return a cost of ZERO.
56 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
57 // Check if the immediate can be encoded within an instruction.
58 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
64 // Calculate how many moves we will need to materialize this constant.
65 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
66 AArch64_IMM::expandMOVImm(Val, 64, Insn);
70 /// Calculate the cost of materializing the given constant.
71 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
72 TTI::TargetCostKind CostKind) {
73 assert(Ty->isIntegerTy());
75 unsigned BitSize = Ty->getPrimitiveSizeInBits();
79 // Sign-extend all constants to a multiple of 64 bits.
82 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
84 // Split the constant into 64-bit chunks and calculate the cost for each chunk.
86 InstructionCost Cost = 0;
87 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
88 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
89 int64_t Val = Tmp.getSExtValue();
90 Cost += getIntImmCost(Val);
92 // We need at least one instruction to materialize the constant.
93 return std::max<InstructionCost>(1, Cost);
96 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
97 const APInt &Imm, Type *Ty,
98 TTI::TargetCostKind CostKind,
100 assert(Ty->isIntegerTy());
102 unsigned BitSize = Ty->getPrimitiveSizeInBits();
103 // There is no cost model for constants with a bit size of 0. Return TCC_Free
104 // here, so that constant hoisting will ignore this constant.
106 return TTI::TCC_Free;
108 unsigned ImmIdx = ~0U;
111 return TTI::TCC_Free;
112 case Instruction::GetElementPtr:
113 // Always hoist the base address of a GetElementPtr.
115 return 2 * TTI::TCC_Basic;
116 return TTI::TCC_Free;
117 case Instruction::Store:
120 case Instruction::Add:
121 case Instruction::Sub:
122 case Instruction::Mul:
123 case Instruction::UDiv:
124 case Instruction::SDiv:
125 case Instruction::URem:
126 case Instruction::SRem:
127 case Instruction::And:
128 case Instruction::Or:
129 case Instruction::Xor:
130 case Instruction::ICmp:
133 // Always return TCC_Free for the shift value of a shift instruction.
134 case Instruction::Shl:
135 case Instruction::LShr:
136 case Instruction::AShr:
138 return TTI::TCC_Free;
140 case Instruction::Trunc:
141 case Instruction::ZExt:
142 case Instruction::SExt:
143 case Instruction::IntToPtr:
144 case Instruction::PtrToInt:
145 case Instruction::BitCast:
146 case Instruction::PHI:
147 case Instruction::Call:
148 case Instruction::Select:
149 case Instruction::Ret:
150 case Instruction::Load:
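// If the immediate sits in the instruction's foldable operand slot, report
// it as free when materializing it costs no more than one basic instruction
// per 64-bit chunk, so constant hoisting leaves it in place.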
155 int NumConstants = (BitSize + 63) / 64;
156 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
157 return (Cost <= NumConstants * TTI::TCC_Basic)
158 ? static_cast<int>(TTI::TCC_Free)
161 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
165 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
166 const APInt &Imm, Type *Ty,
167 TTI::TargetCostKind CostKind) {
168 assert(Ty->isIntegerTy());
170 unsigned BitSize = Ty->getPrimitiveSizeInBits();
171 // There is no cost model for constants with a bit size of 0. Return TCC_Free
172 // here, so that constant hoisting will ignore this constant.
174 return TTI::TCC_Free;
176 // Most (all?) AArch64 intrinsics do not support folding immediates into the
177 // selected instruction, so we compute the materialization cost for the
178 // immediate directly.
179 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
180 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
184 return TTI::TCC_Free;
185 case Intrinsic::sadd_with_overflow:
186 case Intrinsic::uadd_with_overflow:
187 case Intrinsic::ssub_with_overflow:
188 case Intrinsic::usub_with_overflow:
189 case Intrinsic::smul_with_overflow:
190 case Intrinsic::umul_with_overflow:
192 int NumConstants = (BitSize + 63) / 64;
193 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
194 return (Cost <= NumConstants * TTI::TCC_Basic)
195 ? static_cast<int>(TTI::TCC_Free)
199 case Intrinsic::experimental_stackmap:
200 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
201 return TTI::TCC_Free;
203 case Intrinsic::experimental_patchpoint_void:
204 case Intrinsic::experimental_patchpoint_i64:
205 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
206 return TTI::TCC_Free;
208 case Intrinsic::experimental_gc_statepoint:
209 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
210 return TTI::TCC_Free;
213 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
216 TargetTransformInfo::PopcntSupportKind
217 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
218 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
219 if (TyWidth == 32 || TyWidth == 64)
220 return TTI::PSK_FastHardware;
221 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
222 return TTI::PSK_Software;
226 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
227 TTI::TargetCostKind CostKind) {
228 auto *RetTy = ICA.getReturnType();
229 switch (ICA.getID()) {
230 case Intrinsic::umin:
231 case Intrinsic::umax:
232 case Intrinsic::smin:
233 case Intrinsic::smax: {
234 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
235 MVT::v8i16, MVT::v2i32, MVT::v4i32};
236 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
237 // v2i64 types get converted to cmp+bif hence the cost of 2
238 if (LT.second == MVT::v2i64)
240 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
244 case Intrinsic::sadd_sat:
245 case Intrinsic::ssub_sat:
246 case Intrinsic::uadd_sat:
247 case Intrinsic::usub_sat: {
248 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
249 MVT::v8i16, MVT::v2i32, MVT::v4i32,
251 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
252 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
253 // need to extend the type, as it uses shr(qadd(shl, shl)).
255 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
256 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
257 return LT.first * Instrs;
260 case Intrinsic::abs: {
261 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
262 MVT::v8i16, MVT::v2i32, MVT::v4i32,
264 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
265 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
269 case Intrinsic::experimental_stepvector: {
270 InstructionCost Cost = 1; // Cost of the `index' instruction
271 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
272 // Legalisation of illegal vectors involves an `index' instruction plus
273 // (LT.first - 1) vector adds.
275 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
276 InstructionCost AddCost =
277 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
278 Cost += AddCost * (LT.first - 1);
282 case Intrinsic::bitreverse: {
283 static const CostTblEntry BitreverseTbl[] = {
284 {Intrinsic::bitreverse, MVT::i32, 1},
285 {Intrinsic::bitreverse, MVT::i64, 1},
286 {Intrinsic::bitreverse, MVT::v8i8, 1},
287 {Intrinsic::bitreverse, MVT::v16i8, 1},
288 {Intrinsic::bitreverse, MVT::v4i16, 2},
289 {Intrinsic::bitreverse, MVT::v8i16, 2},
290 {Intrinsic::bitreverse, MVT::v2i32, 2},
291 {Intrinsic::bitreverse, MVT::v4i32, 2},
292 {Intrinsic::bitreverse, MVT::v1i64, 2},
293 {Intrinsic::bitreverse, MVT::v2i64, 2},
295 const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
297 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
299 // The cost model uses the legal type (i32) that i8 and i16 are promoted to,
300 // plus 1 so that we match the actual lowering cost.
301 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
302 TLI->getValueType(DL, RetTy, true) == MVT::i16)
303 return LegalisationCost.first * Entry->Cost + 1;
305 return LegalisationCost.first * Entry->Cost;
309 case Intrinsic::ctpop: {
310 static const CostTblEntry CtpopCostTbl[] = {
311 {ISD::CTPOP, MVT::v2i64, 4},
312 {ISD::CTPOP, MVT::v4i32, 3},
313 {ISD::CTPOP, MVT::v8i16, 2},
314 {ISD::CTPOP, MVT::v16i8, 1},
315 {ISD::CTPOP, MVT::i64, 4},
316 {ISD::CTPOP, MVT::v2i32, 3},
317 {ISD::CTPOP, MVT::v4i16, 2},
318 {ISD::CTPOP, MVT::v8i8, 1},
319 {ISD::CTPOP, MVT::i32, 5},
321 auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
323 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
324 // Extra cost of +1 when illegal vector types are legalized by promoting the scalar type.
326 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
327 RetTy->getScalarSizeInBits()
330 return LT.first * Entry->Cost + ExtraCost;
334 case Intrinsic::sadd_with_overflow:
335 case Intrinsic::uadd_with_overflow:
336 case Intrinsic::ssub_with_overflow:
337 case Intrinsic::usub_with_overflow:
338 case Intrinsic::smul_with_overflow:
339 case Intrinsic::umul_with_overflow: {
340 static const CostTblEntry WithOverflowCostTbl[] = {
341 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
342 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
343 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
344 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
345 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
346 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
347 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
348 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
349 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
350 {Intrinsic::usub_with_overflow, MVT::i8, 3},
351 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
352 {Intrinsic::usub_with_overflow, MVT::i16, 3},
353 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
354 {Intrinsic::usub_with_overflow, MVT::i32, 1},
355 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
356 {Intrinsic::usub_with_overflow, MVT::i64, 1},
357 {Intrinsic::smul_with_overflow, MVT::i8, 5},
358 {Intrinsic::umul_with_overflow, MVT::i8, 4},
359 {Intrinsic::smul_with_overflow, MVT::i16, 5},
360 {Intrinsic::umul_with_overflow, MVT::i16, 4},
361 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
362 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
363 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
364 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
366 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
368 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
376 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
379 /// The function removes redundant reinterpret casts in the presence of
380 /// control flow.
381 static Optional<Instruction *> processPhiNode(InstCombiner &IC,
383 SmallVector<Instruction *, 32> Worklist;
384 auto RequiredType = II.getType();
386 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
387 assert(PN && "Expected Phi Node!");
389 // Don't create a new Phi unless we can remove the old one.
390 if (!PN->hasOneUse())
393 for (Value *IncValPhi : PN->incoming_values()) {
394 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
396 Reinterpret->getIntrinsicID() !=
397 Intrinsic::aarch64_sve_convert_to_svbool ||
398 RequiredType != Reinterpret->getArgOperand(0)->getType())
402 // Create the new Phi
403 LLVMContext &Ctx = PN->getContext();
404 IRBuilder<> Builder(Ctx);
405 Builder.SetInsertPoint(PN);
406 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
407 Worklist.push_back(PN);
409 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
410 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
411 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
412 Worklist.push_back(Reinterpret);
415 // Cleanup Phi Node and reinterprets
416 return IC.replaceInstUsesWith(II, NPN);
419 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
420 // => (binop (pred) (from_svbool _) (from_svbool _))
422 // The above transformation eliminates a `to_svbool` in the predicate
423 // operand of bitwise operation `binop` by narrowing the vector width of
424 // the operation. For example, it would convert a `<vscale x 16 x i1>
425 // and` into a `<vscale x 4 x i1> and`. This is profitable because
426 // to_svbool must zero the new lanes during widening, whereas
427 // from_svbool is free.
428 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC,
430 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
434 auto IntrinsicID = BinOp->getIntrinsicID();
435 switch (IntrinsicID) {
436 case Intrinsic::aarch64_sve_and_z:
437 case Intrinsic::aarch64_sve_bic_z:
438 case Intrinsic::aarch64_sve_eor_z:
439 case Intrinsic::aarch64_sve_nand_z:
440 case Intrinsic::aarch64_sve_nor_z:
441 case Intrinsic::aarch64_sve_orn_z:
442 case Intrinsic::aarch64_sve_orr_z:
448 auto BinOpPred = BinOp->getOperand(0);
449 auto BinOpOp1 = BinOp->getOperand(1);
450 auto BinOpOp2 = BinOp->getOperand(2);
452 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
454 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
457 auto PredOp = PredIntr->getOperand(0);
458 auto PredOpTy = cast<VectorType>(PredOp->getType());
459 if (PredOpTy != II.getType())
462 IRBuilder<> Builder(II.getContext());
463 Builder.SetInsertPoint(&II);
465 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
466 auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
467 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
468 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
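// If both operands are the same value, reuse the single narrowed conversion
// rather than emitting a second, identical convert.from.svbool.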
469 if (BinOpOp1 == BinOpOp2)
470 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
472 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
473 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
476 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
477 return IC.replaceInstUsesWith(II, NarrowedBinOp);
480 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
482 // If the reinterpret instruction operand is a PHI Node
483 if (isa<PHINode>(II.getArgOperand(0)))
484 return processPhiNode(IC, II);
486 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
489 SmallVector<Instruction *, 32> CandidatesForRemoval;
490 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
492 const auto *IVTy = cast<VectorType>(II.getType());
494 // Walk the chain of conversions.
496 // If the type of the cursor has fewer lanes than the final result, zeroing
497 // must take place, which breaks the equivalence chain.
498 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
499 if (CursorVTy->getElementCount().getKnownMinValue() <
500 IVTy->getElementCount().getKnownMinValue())
503 // If the cursor has the same type as I, it is a viable replacement.
504 if (Cursor->getType() == IVTy)
505 EarliestReplacement = Cursor;
507 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
509 // If this is not an SVE conversion intrinsic, this is the end of the chain.
510 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
511 Intrinsic::aarch64_sve_convert_to_svbool ||
512 IntrinsicCursor->getIntrinsicID() ==
513 Intrinsic::aarch64_sve_convert_from_svbool))
516 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
517 Cursor = IntrinsicCursor->getOperand(0);
520 // If no viable replacement in the conversion chain was found, there is nothing to do.
522 if (!EarliestReplacement)
525 return IC.replaceInstUsesWith(II, EarliestReplacement);
528 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
530 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
534 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
537 const auto PTruePattern =
538 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
539 if (PTruePattern != AArch64SVEPredPattern::vl1)
542 // The intrinsic is inserting into lane zero so use an insert instead.
543 auto *IdxTy = Type::getInt64Ty(II.getContext());
544 auto *Insert = InsertElementInst::Create(
545 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
546 Insert->insertBefore(&II);
547 Insert->takeName(&II);
549 return IC.replaceInstUsesWith(II, Insert);
552 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
554 // Replace DupX with a regular IR splat.
555 IRBuilder<> Builder(II.getContext());
556 Builder.SetInsertPoint(&II);
557 auto *RetTy = cast<ScalableVectorType>(II.getType());
559 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
560 Splat->takeName(&II);
561 return IC.replaceInstUsesWith(II, Splat);
564 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
566 LLVMContext &Ctx = II.getContext();
567 IRBuilder<> Builder(Ctx);
568 Builder.SetInsertPoint(&II);
570 // Check that the predicate is all active
571 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
572 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
575 const auto PTruePattern =
576 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
577 if (PTruePattern != AArch64SVEPredPattern::all)
580 // Check that we have a compare of zero.
582 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
583 if (!SplatValue || !SplatValue->isZero())
587 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
589 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
592 // Where the dupq is a lane 0 replicate of a vector insert
593 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
596 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
598 VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
601 // Where the vector insert is a fixed constant vector insert into undef at index zero.
603 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
606 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
609 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
613 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
614 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
615 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
618 unsigned NumElts = VecTy->getNumElements();
619 unsigned PredicateBits = 0;
621 // Expand intrinsic operands to a 16-bit byte level predicate
622 for (unsigned I = 0; I < NumElts; ++I) {
623 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
627 PredicateBits |= 1 << (I * (16 / NumElts));
630 // If all bits are zero bail early with an empty predicate
631 if (PredicateBits == 0) {
632 auto *PFalse = Constant::getNullValue(II.getType());
633 PFalse->takeName(&II);
634 return IC.replaceInstUsesWith(II, PFalse);
637 // Calculate largest predicate type used (where byte predicate is largest)
639 for (unsigned I = 0; I < 16; ++I)
640 if ((PredicateBits & (1 << I)) != 0)
643 unsigned PredSize = Mask & -Mask;
644 auto *PredType = ScalableVectorType::get(
645 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
647 // Ensure all relevant bits are set
648 for (unsigned I = 0; I < 16; I += PredSize)
649 if ((PredicateBits & (1 << I)) == 0)
653 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
654 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
655 {PredType}, {PTruePat});
656 auto *ConvertToSVBool = Builder.CreateIntrinsic(
657 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
658 auto *ConvertFromSVBool =
659 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
660 {II.getType()}, {ConvertToSVBool});
662 ConvertFromSVBool->takeName(&II);
663 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
666 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
668 IRBuilder<> Builder(II.getContext());
669 Builder.SetInsertPoint(&II);
670 Value *Pg = II.getArgOperand(0);
671 Value *Vec = II.getArgOperand(1);
672 auto IntrinsicID = II.getIntrinsicID();
673 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
675 // lastX(splat(X)) --> X
676 if (auto *SplatVal = getSplatValue(Vec))
677 return IC.replaceInstUsesWith(II, SplatVal);
679 // If x and/or y is a splat value then:
680 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
682 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
683 if (isSplatValue(LHS) || isSplatValue(RHS)) {
684 auto *OldBinOp = cast<BinaryOperator>(Vec);
685 auto OpC = OldBinOp->getOpcode();
687 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
689 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
690 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
691 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
692 return IC.replaceInstUsesWith(II, NewBinOp);
696 auto *C = dyn_cast<Constant>(Pg);
697 if (IsAfter && C && C->isNullValue()) {
698 // The intrinsic is extracting lane 0 so use an extract instead.
699 auto *IdxTy = Type::getInt64Ty(II.getContext());
700 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
701 Extract->insertBefore(&II);
702 Extract->takeName(&II);
703 return IC.replaceInstUsesWith(II, Extract);
706 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
710 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
713 const auto PTruePattern =
714 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
716 // Can the intrinsic's predicate be converted to a known constant index?
717 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
721 unsigned Idx = MinNumElts - 1;
722 // Increment the index if extracting the element after the last active
723 // predicate element.
727 // Ignore extracts whose index is larger than the known minimum vector
728 // length. NOTE: This is an artificial constraint where we prefer to
729 // maintain what the user asked for until an alternative is proven faster.
730 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
731 if (Idx >= PgVTy->getMinNumElements())
734 // The intrinsic is extracting a fixed lane so use an extract instead.
735 auto *IdxTy = Type::getInt64Ty(II.getContext());
736 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
737 Extract->insertBefore(&II);
738 Extract->takeName(&II);
739 return IC.replaceInstUsesWith(II, Extract);
742 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
744 LLVMContext &Ctx = II.getContext();
745 IRBuilder<> Builder(Ctx);
746 Builder.SetInsertPoint(&II);
747 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
748 // can work with RDFFR_PP for ptest elimination.
750 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
751 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
752 {II.getType()}, {AllPat});
754 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
755 RDFFR->takeName(&II);
756 return IC.replaceInstUsesWith(II, RDFFR);
759 static Optional<Instruction *>
760 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
761 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
763 if (Pattern == AArch64SVEPredPattern::all) {
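// For the "all" pattern the element count is exactly NumElts * vscale, which
// can be expressed directly with a vscale constant.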
764 LLVMContext &Ctx = II.getContext();
765 IRBuilder<> Builder(Ctx);
766 Builder.SetInsertPoint(&II);
768 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
769 auto *VScale = Builder.CreateVScale(StepVal);
770 VScale->takeName(&II);
771 return IC.replaceInstUsesWith(II, VScale);
774 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
776 return MinNumElts && NumElts >= MinNumElts
777 ? Optional<Instruction *>(IC.replaceInstUsesWith(
778 II, ConstantInt::get(II.getType(), MinNumElts)))
782 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
784 IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
785 IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
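// ptest(to_svbool(x), to_svbool(y)) can test x and y directly when both
// operands share the same unpacked predicate type.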
788 Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
789 Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
790 Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
792 IRBuilder<> Builder(II.getContext());
793 Builder.SetInsertPoint(&II);
795 Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
796 Type *Tys[] = {Op1->getArgOperand(0)->getType()};
798 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
800 PTest->takeName(&II);
801 return IC.replaceInstUsesWith(II, PTest);
807 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC,
809 // fold (fadd p a (fmul p b c)) -> (fma p a b c)
810 Value *P = II.getOperand(0);
811 Value *A = II.getOperand(1);
812 auto FMul = II.getOperand(2);
814 if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(
815 m_Specific(P), m_Value(B), m_Value(C))))
818 if (!FMul->hasOneUse())
821 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
822 // Stop the combine when the flags on the inputs differ in case dropping flags
823 // would lead to us missing out on more beneficial optimizations.
824 if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())
826 if (!FAddFlags.allowContract())
829 IRBuilder<> Builder(II.getContext());
830 Builder.SetInsertPoint(&II);
831 auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
832 {II.getType()}, {P, A, B, C}, &II);
833 FMLA->setFastMathFlags(FAddFlags);
834 return IC.replaceInstUsesWith(II, FMLA);
837 static bool isAllActivePredicate(Value *Pred) {
838 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
840 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
841 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
842 m_Value(UncastedPred)))))
843 // If the predicate has the same or fewer lanes than the uncasted
844 // predicate then we know the casting has no effect.
845 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
846 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
849 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
850 m_ConstantInt<AArch64SVEPredPattern::all>()));
853 static Optional<Instruction *>
854 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
855 IRBuilder<> Builder(II.getContext());
856 Builder.SetInsertPoint(&II);
858 Value *Pred = II.getOperand(0);
859 Value *PtrOp = II.getOperand(1);
860 Type *VecTy = II.getType();
861 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
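// An all-active predicate makes the masked load equivalent to a plain
// unconditional load.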
863 if (isAllActivePredicate(Pred)) {
864 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
865 return IC.replaceInstUsesWith(II, Load);
868 CallInst *MaskedLoad =
869 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
870 Pred, ConstantAggregateZero::get(VecTy));
871 return IC.replaceInstUsesWith(II, MaskedLoad);
874 static Optional<Instruction *>
875 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
876 IRBuilder<> Builder(II.getContext());
877 Builder.SetInsertPoint(&II);
879 Value *VecOp = II.getOperand(0);
880 Value *Pred = II.getOperand(1);
881 Value *PtrOp = II.getOperand(2);
883 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
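// An all-active predicate makes the masked store equivalent to a plain
// unconditional store.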
885 if (isAllActivePredicate(Pred)) {
886 Builder.CreateStore(VecOp, VecPtr);
887 return IC.eraseInstFromFunction(II);
890 Builder.CreateMaskedStore(VecOp, VecPtr, PtrOp->getPointerAlignment(DL),
892 return IC.eraseInstFromFunction(II);
895 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
897 case Intrinsic::aarch64_sve_fmul:
898 return Instruction::BinaryOps::FMul;
899 case Intrinsic::aarch64_sve_fadd:
900 return Instruction::BinaryOps::FAdd;
901 case Intrinsic::aarch64_sve_fsub:
902 return Instruction::BinaryOps::FSub;
904 return Instruction::BinaryOpsEnd;
908 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
910 auto *OpPredicate = II.getOperand(0);
911 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
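// Only strip the predicate when the intrinsic maps onto a plain IR binop and
// the governing predicate is an all-active ptrue, in which case the
// unpredicated form is semantically identical.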
912 if (BinOpCode == Instruction::BinaryOpsEnd ||
913 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
914 m_ConstantInt<AArch64SVEPredPattern::all>())))
916 IRBuilder<> Builder(II.getContext());
917 Builder.SetInsertPoint(&II);
918 Builder.setFastMathFlags(II.getFastMathFlags());
920 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
921 return IC.replaceInstUsesWith(II, BinOp);
924 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC,
926 if (auto FMLA = instCombineSVEVectorFMLA(IC, II))
928 return instCombineSVEVectorBinOp(IC, II);
931 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
933 auto *OpPredicate = II.getOperand(0);
934 auto *OpMultiplicand = II.getOperand(1);
935 auto *OpMultiplier = II.getOperand(2);
937 IRBuilder<> Builder(II.getContext());
938 Builder.SetInsertPoint(&II);
940 // Return true if a given instruction is a unit splat value, false otherwise.
941 auto IsUnitSplat = [](auto *I) {
942 auto *SplatValue = getSplatValue(I);
945 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
948 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
949 // with a unit splat value, false otherwise.
950 auto IsUnitDup = [](auto *I) {
951 auto *IntrI = dyn_cast<IntrinsicInst>(I);
952 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
955 auto *SplatValue = IntrI->getOperand(2);
956 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
959 if (IsUnitSplat(OpMultiplier)) {
960 // [f]mul pg %n, (dupx 1) => %n
961 OpMultiplicand->takeName(&II);
962 return IC.replaceInstUsesWith(II, OpMultiplicand);
963 } else if (IsUnitDup(OpMultiplier)) {
964 // [f]mul pg %n, (dup pg 1) => %n
965 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
966 auto *DupPg = DupInst->getOperand(1);
967 // TODO: this is naive. The optimization is still valid if DupPg
968 // 'encompasses' OpPredicate, not only if they're the same predicate.
969 if (OpPredicate == DupPg) {
970 OpMultiplicand->takeName(&II);
971 return IC.replaceInstUsesWith(II, OpMultiplicand);
975 return instCombineSVEVectorBinOp(IC, II);
978 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
980 IRBuilder<> Builder(II.getContext());
981 Builder.SetInsertPoint(&II);
982 Value *UnpackArg = II.getArgOperand(0);
983 auto *RetTy = cast<ScalableVectorType>(II.getType());
984 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
985 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
987 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
988 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
989 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
991 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
993 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
994 NewVal->takeName(&II);
995 return IC.replaceInstUsesWith(II, NewVal);
1000 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1001 IntrinsicInst &II) {
1002 auto *OpVal = II.getOperand(0);
1003 auto *OpIndices = II.getOperand(1);
1004 VectorType *VTy = cast<VectorType>(II.getType());
1006 // Check whether OpIndices is a constant splat value < minimal element count
1008 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1010 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1013 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1014 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1015 IRBuilder<> Builder(II.getContext());
1016 Builder.SetInsertPoint(&II);
1017 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1019 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1021 VectorSplat->takeName(&II);
1022 return IC.replaceInstUsesWith(II, VectorSplat);
1025 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
1026 IntrinsicInst &II) {
1027 // Try to remove sequences of tuple get/set.
1028 Value *SetTuple, *SetIndex, *SetValue;
1029 auto *GetTuple = II.getArgOperand(0);
1030 auto *GetIndex = II.getArgOperand(1);
1031 // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
1032 // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
1033 // Make sure that the types of the current intrinsic and SetValue match
1034 // in order to safely remove the sequence.
1035 if (!match(GetTuple,
1036 m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
1037 m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
1038 SetValue->getType() != II.getType())
1040 // Case where we get the same index right after setting it.
1041 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
1042 if (GetIndex == SetIndex)
1043 return IC.replaceInstUsesWith(II, SetValue);
1044 // If we are getting a different index than what was set in the tuple_set
1045 // intrinsic, we can just set the input tuple to the one up in the chain.
1046 // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
1047 // --> tuple_get(SetTuple, GetIndex)
1048 return IC.replaceOperand(II, 0, SetTuple);
1051 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1052 IntrinsicInst &II) {
1053 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1054 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1056 if (match(II.getArgOperand(0),
1057 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1058 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1059 m_Specific(A), m_Specific(B))))
1060 return IC.replaceInstUsesWith(
1061 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1066 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
1067 IntrinsicInst &II) {
1068 Value *Mask = II.getOperand(0);
1069 Value *BasePtr = II.getOperand(1);
1070 Value *Index = II.getOperand(2);
1071 Type *Ty = II.getType();
1072 Type *BasePtrTy = BasePtr->getType();
1073 Value *PassThru = ConstantAggregateZero::get(Ty);
1075 // Contiguous gather => masked load.
1076 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1077 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1079 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1080 m_Value(IndexBase), m_SpecificInt(1)))) {
1081 IRBuilder<> Builder(II.getContext());
1082 Builder.SetInsertPoint(&II);
1085 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1087 Type *VecPtrTy = PointerType::getUnqual(Ty);
1088 Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
1090 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1091 CallInst *MaskedLoad =
1092 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1093 MaskedLoad->takeName(&II);
1094 return IC.replaceInstUsesWith(II, MaskedLoad);
1100 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
1101 IntrinsicInst &II) {
1102 Value *Val = II.getOperand(0);
1103 Value *Mask = II.getOperand(1);
1104 Value *BasePtr = II.getOperand(2);
1105 Value *Index = II.getOperand(3);
1106 Type *Ty = Val->getType();
1107 Type *BasePtrTy = BasePtr->getType();
1109 // Contiguous scatter => masked store.
1110 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1111 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1113 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1114 m_Value(IndexBase), m_SpecificInt(1)))) {
1115 IRBuilder<> Builder(II.getContext());
1116 Builder.SetInsertPoint(&II);
1119 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1121 Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
1123 Type *VecPtrTy = PointerType::getUnqual(Ty);
1124 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1126 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1128 return IC.eraseInstFromFunction(II);
1134 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1135 IntrinsicInst &II) {
1136 IRBuilder<> Builder(II.getContext());
1137 Builder.SetInsertPoint(&II);
1138 Type *Int32Ty = Builder.getInt32Ty();
1139 Value *Pred = II.getOperand(0);
1140 Value *Vec = II.getOperand(1);
1141 Value *DivVec = II.getOperand(2);
1143 Value *SplatValue = getSplatValue(DivVec);
1144 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1145 if (!SplatConstantInt)
1147 APInt Divisor = SplatConstantInt->getValue();
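// A signed divide by a positive power of two lowers to a single predicated
// arithmetic-shift-right-for-divide (asrd).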
1149 if (Divisor.isPowerOf2()) {
1150 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1151 auto ASRD = Builder.CreateIntrinsic(
1152 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1153 return IC.replaceInstUsesWith(II, ASRD);
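// For a negative power-of-two divisor, shift by the log2 of its magnitude
// and then negate the result.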
1155 if (Divisor.isNegatedPowerOf2()) {
1157 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1158 auto ASRD = Builder.CreateIntrinsic(
1159 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1160 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1161 {ASRD->getType()}, {ASRD, Pred, ASRD});
1162 return IC.replaceInstUsesWith(II, NEG);
1168 Optional<Instruction *>
1169 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1170 IntrinsicInst &II) const {
1171 Intrinsic::ID IID = II.getIntrinsicID();
1175 case Intrinsic::aarch64_sve_convert_from_svbool:
1176 return instCombineConvertFromSVBool(IC, II);
1177 case Intrinsic::aarch64_sve_dup:
1178 return instCombineSVEDup(IC, II);
1179 case Intrinsic::aarch64_sve_dup_x:
1180 return instCombineSVEDupX(IC, II);
1181 case Intrinsic::aarch64_sve_cmpne:
1182 case Intrinsic::aarch64_sve_cmpne_wide:
1183 return instCombineSVECmpNE(IC, II);
1184 case Intrinsic::aarch64_sve_rdffr:
1185 return instCombineRDFFR(IC, II);
1186 case Intrinsic::aarch64_sve_lasta:
1187 case Intrinsic::aarch64_sve_lastb:
1188 return instCombineSVELast(IC, II);
1189 case Intrinsic::aarch64_sve_cntd:
1190 return instCombineSVECntElts(IC, II, 2);
1191 case Intrinsic::aarch64_sve_cntw:
1192 return instCombineSVECntElts(IC, II, 4);
1193 case Intrinsic::aarch64_sve_cnth:
1194 return instCombineSVECntElts(IC, II, 8);
1195 case Intrinsic::aarch64_sve_cntb:
1196 return instCombineSVECntElts(IC, II, 16);
1197 case Intrinsic::aarch64_sve_ptest_any:
1198 case Intrinsic::aarch64_sve_ptest_first:
1199 case Intrinsic::aarch64_sve_ptest_last:
1200 return instCombineSVEPTest(IC, II);
1201 case Intrinsic::aarch64_sve_mul:
1202 case Intrinsic::aarch64_sve_fmul:
1203 return instCombineSVEVectorMul(IC, II);
1204 case Intrinsic::aarch64_sve_fadd:
1205 return instCombineSVEVectorFAdd(IC, II);
1206 case Intrinsic::aarch64_sve_fsub:
1207 return instCombineSVEVectorBinOp(IC, II);
1208 case Intrinsic::aarch64_sve_tbl:
1209 return instCombineSVETBL(IC, II);
1210 case Intrinsic::aarch64_sve_uunpkhi:
1211 case Intrinsic::aarch64_sve_uunpklo:
1212 case Intrinsic::aarch64_sve_sunpkhi:
1213 case Intrinsic::aarch64_sve_sunpklo:
1214 return instCombineSVEUnpack(IC, II);
1215 case Intrinsic::aarch64_sve_tuple_get:
1216 return instCombineSVETupleGet(IC, II);
1217 case Intrinsic::aarch64_sve_zip1:
1218 case Intrinsic::aarch64_sve_zip2:
1219 return instCombineSVEZip(IC, II);
1220 case Intrinsic::aarch64_sve_ld1_gather_index:
1221 return instCombineLD1GatherIndex(IC, II);
1222 case Intrinsic::aarch64_sve_st1_scatter_index:
1223 return instCombineST1ScatterIndex(IC, II);
1224 case Intrinsic::aarch64_sve_ld1:
1225 return instCombineSVELD1(IC, II, DL);
1226 case Intrinsic::aarch64_sve_st1:
1227 return instCombineSVEST1(IC, II, DL);
1228 case Intrinsic::aarch64_sve_sdiv:
1229 return instCombineSVESDIV(IC, II);
1235 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1236 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1237 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1238 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1239 SimplifyAndSetOp) const {
1240 switch (II.getIntrinsicID()) {
1243 case Intrinsic::aarch64_neon_fcvtxn:
1244 case Intrinsic::aarch64_neon_rshrn:
1245 case Intrinsic::aarch64_neon_sqrshrn:
1246 case Intrinsic::aarch64_neon_sqrshrun:
1247 case Intrinsic::aarch64_neon_sqshrn:
1248 case Intrinsic::aarch64_neon_sqshrun:
1249 case Intrinsic::aarch64_neon_sqxtn:
1250 case Intrinsic::aarch64_neon_sqxtun:
1251 case Intrinsic::aarch64_neon_uqrshrn:
1252 case Intrinsic::aarch64_neon_uqshrn:
1253 case Intrinsic::aarch64_neon_uqxtn:
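// These narrowing intrinsics produce each output element from only the
// corresponding input element, so the demanded elements pass through to
// operand 0 unchanged.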
1254 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1261 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1262 ArrayRef<const Value *> Args) {
1264 // A helper that returns a vector type from the given type. The number of
1265 // elements in type Ty determines the vector width.
1266 auto toVectorTy = [&](Type *ArgTy) {
1267 return VectorType::get(ArgTy->getScalarType(),
1268 cast<VectorType>(DstTy)->getElementCount());
1272 // Exit early if DstTy is not a vector type whose elements are at least 16 bits wide.
1273 if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
1276 // Determine if the operation has a widening variant. We consider both the
1277 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1280 // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
1281 // verify that their extending operands are eliminated during code generation.
1284 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1285 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1291 // To be a widening instruction (either the "wide" or "long" versions), the
1292 // second operand must be a sign- or zero extend having a single user. We
1293 // only consider extends having a single user because they may otherwise not be eliminated.
1295 if (Args.size() != 2 ||
1296 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
1297 !Args[1]->hasOneUse())
1299 auto *Extend = cast<CastInst>(Args[1]);
1301 // Legalize the destination type and ensure it can be used in a widening operation.
1303 auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
1304 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1305 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1308 // Legalize the source type and ensure it can be used in a widening operation.
1310 auto *SrcTy = toVectorTy(Extend->getSrcTy());
1311 auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
1312 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1313 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1316 // Get the total number of vector elements in the legalized types.
1317 InstructionCost NumDstEls =
1318 DstTyL.first * DstTyL.second.getVectorMinNumElements();
1319 InstructionCost NumSrcEls =
1320 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1322 // Return true if the legalized types have the same number of vector elements
1323 // and the destination element type size is twice that of the source type.
1324 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1327 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1329 TTI::CastContextHint CCH,
1330 TTI::TargetCostKind CostKind,
1331 const Instruction *I) {
1332 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1333 assert(ISD && "Invalid opcode");
1335 // If the cast is observable, and it is used by a widening instruction (e.g.,
1336 // uaddl, saddw, etc.), it may be free.
1337 if (I && I->hasOneUse()) {
1338 auto *SingleUser = cast<Instruction>(*I->user_begin());
1339 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1340 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1341 // If the cast is the second operand, it is free. We will generate either
1342 // a "wide" or "long" version of the widening instruction.
1343 if (I == SingleUser->getOperand(1))
1345 // If the cast is not the second operand, it will be free if it looks the
1346 // same as the second operand. In this case, we will generate a "long"
1347 // version of the widening instruction.
1348 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1349 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1350 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1355 // TODO: Allow non-throughput costs that aren't binary.
1356 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1357 if (CostKind != TTI::TCK_RecipThroughput)
1358 return Cost == 0 ? 0 : 1;
1362 EVT SrcTy = TLI->getValueType(DL, Src);
1363 EVT DstTy = TLI->getValueType(DL, Dst);
1365 if (!SrcTy.isSimple() || !DstTy.isSimple())
1367 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1369 static const TypeConversionCostTblEntry
1371 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
1372 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
1373 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
1374 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
1376 // Truncations on nxvmiN
1377 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
1378 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
1379 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
1380 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
1381 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
1382 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
1383 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
1384 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
1385 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
1386 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
1387 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
1388 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
1389 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
1390 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
1391 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
1392 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
1394 // The number of shll instructions for the extension.
1395 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1396 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
1397 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1398 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
1399 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1400 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
1401 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1402 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
1403 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1404 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
1405 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1406 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
1407 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1408 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1409 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1410 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1412 // LowerVectorINT_TO_FP:
1413 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1414 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1415 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1416 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1417 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1418 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1420 // Complex: to v2f32
1421 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1422 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1423 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1424 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
1425 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1426 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1428 // Complex: to v4f32
1429 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
1430 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1431 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
1432 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1434 // Complex: to v8f32
1435 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1436 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1437 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
1438 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1440 // Complex: to v16f32
1441 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1442 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1444 // Complex: to v2f64
1445 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1446 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1447 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1448 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
1449 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1450 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1453 // LowerVectorFP_TO_INT
1454 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
1455 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
1456 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1457 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1458 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1459 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1461 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1462 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
1463 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
1464 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
1465 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
1466 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
1467 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
1469 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1470 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1471 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
1472 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1473 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
1475 // Complex, from nxv2f32.
1476 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1477 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1478 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1479 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1480 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1481 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1482 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1483 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
1485 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1486 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
1487 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1488 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
1489 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
1490 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1491 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
1493 // Complex, from nxv2f64.
1494 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1495 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1496 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1497 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1498 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1499 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1500 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1501 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
1503 // Complex, from nxv4f32.
1504 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1505 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1506 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1507 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1508 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1509 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1510 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1511 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
1513 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1514 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1515 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1516 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1517 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
1519 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1520 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1521 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1522 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1523 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1524 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1525 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
1527 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1528 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1529 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1530 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
1531 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
1533 // Complex, from nxv8f16.
1534 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1535 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1536 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1537 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1538 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
1539 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
1540 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
1541 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
1543 // Complex, from nxv4f16.
1544 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1545 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1546 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1547 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1548 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
1549 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
1550 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
1551 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
1553 // Complex, from nxv2f16.
1554 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1555 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1556 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1557 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1558 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
1559 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
1560 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
1561 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
1563 // Truncate from nxvmf32 to nxvmf16.
1564 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
1565 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
1566 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
1568 // Truncate from nxvmf64 to nxvmf16.
1569 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
1570 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
1571 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
1573 // Truncate from nxvmf64 to nxvmf32.
1574 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
1575 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
1576 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
1578 // Extend from nxvmf16 to nxvmf32.
1579 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
1580 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
1581 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
1583 // Extend from nxvmf16 to nxvmf64.
1584 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
1585 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
1586 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
1588 // Extend from nxvmf32 to nxvmf64.
1589 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
1590 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
1591 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
1593 // Bitcasts from float to integer
1594 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
1595 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
1596 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
1598 // Bitcasts from integer to float
1599 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
1600 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
    { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
  };
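  // As a rough illustration of how this table is used: an FP_TO_UINT from
  // nxv4f64 to nxv4i32 matches the "Complex, from nxv4f64" entry above and is
  // therefore costed at 3 by the lookup below.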
1604 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1605 DstTy.getSimpleVT(),
1606 SrcTy.getSimpleVT()))
1607 return AdjustCost(Entry->Cost);
  return AdjustCost(
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
1613 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1618 // Make sure we were given a valid extend opcode.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");
1622 // We are extending an element we extract from a vector, so the source type
1623 // of the extend is the element type of the vector.
1624 auto *Src = VecTy->getElementType();
1626 // Sign- and zero-extends are for integer types only.
1627 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
  // Get the cost for the extract. We compute the cost (if any) for the extend
  // below.
1631 InstructionCost Cost =
1632 getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1634 // Legalize the types.
1635 auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
1636 auto DstVT = TLI->getValueType(DL, Dst);
1637 auto SrcVT = TLI->getValueType(DL, Src);
1638 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1640 // If the resulting type is still a vector and the destination type is legal,
1641 // we may get the extension for free. If not, get the default cost for the
1643 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1644 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1647 // The destination type should be larger than the element type. If not, get
1648 // the default cost for the extend.
1649 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
1650 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
  switch (Opcode) {
  default:
    llvm_unreachable("Opcode should be either SExt or ZExt");
  // For sign-extends, we only need a smov, which performs the extension
  // in the register file.
  case Instruction::SExt:
    return Cost;
  // For zero-extends, the extend is performed automatically by a umov unless
  // the destination type is i64 and the element type is i8 or i16.
  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
      return Cost;
  }

  // If we are unable to perform the extend for free, get the default cost.
  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
                                 CostKind);
}
1674 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1675 TTI::TargetCostKind CostKind,
1676 const Instruction *I) {
1677 if (CostKind != TTI::TCK_RecipThroughput)
1678 return Opcode == Instruction::PHI ? 0 : 1;
1679 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
  // Branches are assumed to be predicted.
  return 0;
}
1684 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1686 assert(Val->isVectorTy() && "This must be a vector type");
1689 // Legalize the type.
1690 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1692 // This type is legalized to a scalar type.
1693 if (!LT.second.isVector())
1696 // The type may be split. For fixed-width vectors we can normalize the
1697 // index to the new type.
  if (LT.second.isFixedLengthVector()) {
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;
  }
  // The element at index zero is already inside the vector.
  if (Index == 0)
    return 0;
1708 // All other insert/extracts cost this much.
1709 return ST->getVectorInsertExtractBaseCost();
1712 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
1713 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1714 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
1715 TTI::OperandValueProperties Opd1PropInfo,
1716 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1717 const Instruction *CxtI) {
1718 // TODO: Handle more cost kinds.
1719 if (CostKind != TTI::TCK_RecipThroughput)
1720 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1721 Opd2Info, Opd1PropInfo,
1722 Opd2PropInfo, Args, CxtI);
1724 // Legalize the type.
1725 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1727 // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
1728 // add in the widening overhead specified by the sub-target. Since the
1729 // extends feeding widening instructions are performed automatically, they
1730 // aren't present in the generated code and have a zero cost. By adding a
1731 // widening overhead here, we attach the total cost of the combined operation
1732 // to the widening instruction.
1733 InstructionCost Cost = 0;
1734 if (isWideningInstruction(Ty, Opcode, Args))
1735 Cost += ST->getWideningBaseCost();
1737 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1741 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1743 Opd1PropInfo, Opd2PropInfo);
1745 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1746 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
      // On AArch64, scalar signed division by a power-of-two constant is
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as those of the
      // previous operation; conservatively assume OP_None.
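      // For example (illustrative, subtarget-dependent), a scalar
      // 'sdiv i32 %x, 8' is typically selected as roughly:
      //   add  w8, w0, #7
      //   cmp  w0, #0
      //   csel w8, w8, w0, lt
      //   asr  w0, w8, #3
      // which is why the four component costs are summed below.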
1751 Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
1753 TargetTransformInfo::OP_None,
1754 TargetTransformInfo::OP_None);
1755 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
1757 TargetTransformInfo::OP_None,
1758 TargetTransformInfo::OP_None);
1759 Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
1761 TargetTransformInfo::OP_None,
1762 TargetTransformInfo::OP_None);
1763 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
1765 TargetTransformInfo::OP_None,
1766 TargetTransformInfo::OP_None);
1771 if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1772 auto VT = TLI->getValueType(DL, Ty);
1773 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by a constant is expanded to the
        // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
        // to MULHS + SUB + SRL + ADD + SRL.
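        // For example (illustrative), 'udiv <4 x i32> %x, splat(7)' is lowered
        // via a multiply-high with a "magic" constant followed by shifts and
        // adds rather than per-element division, which the costs below model.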
1777 InstructionCost MulCost = getArithmeticInstrCost(
1778 Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1779 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1780 InstructionCost AddCost = getArithmeticInstrCost(
1781 Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1782 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1783 InstructionCost ShrCost = getArithmeticInstrCost(
1784 Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1785 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1786 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1790 Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1792 Opd1PropInfo, Opd2PropInfo);
1793 if (Ty->isVectorTy()) {
1794 // On AArch64, vector divisions are not supported natively and are
1795 // expanded into scalar divisions of each pair of elements.
1796 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1797 Opd1Info, Opd2Info, Opd1PropInfo,
1799 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1800 Opd1Info, Opd2Info, Opd1PropInfo,
1802 // TODO: if one of the arguments is scalar, then it's not necessary to
1803 // double the cost of handling the vector elements.
1809 if (LT.second != MVT::v2i64)
1810 return (Cost + 1) * LT.first;
1811 // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1812 // as elements are extracted from the vectors and the muls scalarized.
1813 // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1814 // cost for a i64 vector directly here, which is:
    // - four i64 extracts,
    // - two i64 inserts, and
    // - two muls.
    // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with
1819 // LT.first = 2 the cost is 16.
1820 return LT.first * 8;
1825 // These nodes are marked as 'custom' for combining purposes only.
1826 // We know that they are legal. See LowerAdd in ISelLowering.
1827 return (Cost + 1) * LT.first;
1834 // These nodes are marked as 'custom' just to lower them to SVE.
1835 // We know said lowering will incur no additional cost.
1836 if (!Ty->getScalarType()->isFP128Ty())
1837 return (Cost + 2) * LT.first;
1839 return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1841 Opd1PropInfo, Opd2PropInfo);
1845 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1846 ScalarEvolution *SE,
1848 // Address computations in vectorized code with non-consecutive addresses will
1849 // likely result in more instructions compared to scalar code where the
1850 // computation can more often be merged into the index mode. The resulting
1851 // extra micro-ops can significantly decrease throughput.
1852 unsigned NumVectorInstToHideOverhead = 10;
1853 int MaxMergeDistance = 64;
1855 if (Ty->isVectorTy() && SE &&
1856 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1857 return NumVectorInstToHideOverhead;
  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}
1864 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1866 CmpInst::Predicate VecPred,
1867 TTI::TargetCostKind CostKind,
1868 const Instruction *I) {
1869 // TODO: Handle other cost kinds.
1870 if (CostKind != TTI::TCK_RecipThroughput)
1871 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1874 int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower some vector selects well that are wider than the register
  // width.
1877 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
1878 // We would need this many instructions to hide the scalarization happening.
1879 const int AmortizationCost = 20;
1881 // If VecPred is not set, check if we can get a predicate from the context
1882 // instruction, if its type matches the requested ValTy.
1883 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1884 CmpInst::Predicate CurrentPred;
1885 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1887 VecPred = CurrentPred;
1889 // Check if we have a compare/select chain that can be lowered using
1890 // a (F)CMxx & BFI pair.
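    // For example (illustrative), 'select (icmp sgt <4 x i32> %a, %b), %c, %d'
    // can be selected as a cmgt feeding a bsl/bif, so for the types listed
    // below it only costs the legalization factor.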
1891 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
1892 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
1893 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
1894 VecPred == CmpInst::FCMP_UNE) {
1895 static const auto ValidMinMaxTys = {
1896 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1897 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
1898 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
1900 auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
        (ST->hasFullFP16() &&
         any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
      return LT.first;
  }
1907 static const TypeConversionCostTblEntry
1908 VectorSelectTbl[] = {
1909 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
1910 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1911 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
1912 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1913 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1914 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1917 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1918 EVT SelValTy = TLI->getValueType(DL, ValTy);
1919 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1920 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1921 SelCondTy.getSimpleVT(),
1922 SelValTy.getSimpleVT()))
1926 // The base case handles scalable vectors fine for now, since it treats the
1927 // cost as 1 * legalization cost.
1928 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1931 AArch64TTIImpl::TTI::MemCmpExpansionOptions
1932 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1933 TTI::MemCmpExpansionOptions Options;
1934 if (ST->requiresStrictAlign()) {
1935 // TODO: Add cost modeling for strict align. Misaligned loads expand to
    // a bunch of instructions when strict align is enabled.
    return Options;
  }
1939 Options.AllowOverlappingLoads = true;
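  // With overlapping loads a 15-byte memcmp can, for example, be expanded as
  // two overlapping 8-byte loads per buffer (bytes [0,7] and [7,14]) instead
  // of an 8+4+2+1 load sequence (illustrative; the expansion pass decides).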
1940 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1941 Options.NumLoadsPerBlock = Options.MaxNumLoads;
1942 // TODO: Though vector loads usually perform well on AArch64, in some targets
1943 // they may wake up the FP unit, which raises the power consumption. Perhaps
1944 // they could be used with no holds barred (-O3).
  Options.LoadSizes = {8, 4, 2, 1};
  return Options;
}
1950 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1951 Align Alignment, unsigned AddressSpace,
1952 TTI::TargetCostKind CostKind) {
1953 if (useNeonVector(Src))
1954 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1956 auto LT = TLI->getTypeLegalizationCost(DL, Src);
1957 if (!LT.first.isValid())
1958 return InstructionCost::getInvalid();
1960 // The code-generator is currently not able to handle scalable vectors
1961 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1962 // it. This change will be removed when code-generation for these types is
1963 // sufficiently reliable.
1964 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
1965 return InstructionCost::getInvalid();
1967 return LT.first * 2;
1970 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
1971 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
1974 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
1975 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1976 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1977 if (useNeonVector(DataTy))
1978 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1979 Alignment, CostKind, I);
1980 auto *VT = cast<VectorType>(DataTy);
1981 auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
1982 if (!LT.first.isValid())
1983 return InstructionCost::getInvalid();
1985 // The code-generator is currently not able to handle scalable vectors
1986 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1987 // it. This change will be removed when code-generation for these types is
1988 // sufficiently reliable.
1989 if (cast<VectorType>(DataTy)->getElementCount() ==
1990 ElementCount::getScalable(1))
1991 return InstructionCost::getInvalid();
1993 ElementCount LegalVF = LT.second.getVectorElementCount();
1994 InstructionCost MemOpCost =
1995 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
1996 // Add on an overhead cost for using gathers/scatters.
1997 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
1998 // point we may want a per-CPU overhead.
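  // Rough illustration (not from measured data): with the default
  // sve-gather-overhead of 10, a gather of <vscale x 4 x i32> whose maximum
  // legal element count is 8 would be costed at roughly 10 * 8 times the
  // scalar load cost per legalized vector.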
1999 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2000 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2003 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
2004 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2007 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
2008 MaybeAlign Alignment,
2009 unsigned AddressSpace,
2010 TTI::TargetCostKind CostKind,
2011 const Instruction *I) {
2012 EVT VT = TLI->getValueType(DL, Ty, true);
2013 // Type legalization can't handle structs
2014 if (VT == MVT::Other)
2015 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2018 auto LT = TLI->getTypeLegalizationCost(DL, Ty);
2019 if (!LT.first.isValid())
2020 return InstructionCost::getInvalid();
2022 // The code-generator is currently not able to handle scalable vectors
2023 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2024 // it. This change will be removed when code-generation for these types is
2025 // sufficiently reliable.
2026 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2027 if (VTy->getElementCount() == ElementCount::getScalable(1))
2028 return InstructionCost::getInvalid();
2030 // TODO: consider latency as well for TCK_SizeAndLatency.
2031 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
2034 if (CostKind != TTI::TCK_RecipThroughput)
2037 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2038 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because of the negative impact that has shown
    // in practice on inlined block copy code.
2042 // We make such stores expensive so that we will only vectorize if there
2043 // are 6 other instructions getting vectorized.
2044 const int AmortizationCost = 6;
2046 return LT.first * 2 * AmortizationCost;
2049 // Check truncating stores and extending loads.
2050 if (useNeonVector(Ty) &&
2051 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
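    // As a rough illustration, an extending load of <4 x i8> can be selected
    // as a single 32-bit scalar load plus a sshll/ushll, and a truncating
    // <4 x i8> store as an xtn/uzp1 plus a 32-bit scalar store.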
    // v4i8 types are lowered to a scalar load/store and sshll/xtn.
    if (VT == MVT::v4i8)
      return 2;
    // Otherwise we need to scalarize.
    return cast<FixedVectorType>(Ty)->getNumElements() * 2;
  }
  return LT.first;
}
2062 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2063 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2064 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2065 bool UseMaskForCond, bool UseMaskForGaps) {
2066 assert(Factor >= 2 && "Invalid interleave factor");
2067 auto *VecVTy = cast<FixedVectorType>(VecTy);
2069 if (!UseMaskForCond && !UseMaskForGaps &&
2070 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2071 unsigned NumElts = VecVTy->getNumElements();
2073 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2075 // ldN/stN only support legal vector types of size 64 or 128 in bits.
2076 // Accesses having vector types that are a multiple of 128 bits can be
2077 // matched to more than one ldN/stN instruction.
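    // For example (illustrative): a <16 x i32> access with Factor == 2 has a
    // SubVecTy of <8 x i32> (256 bits), which needs two 128-bit ld2/st2
    // operations, so the returned cost would be 2 * 2 = 4.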
    bool UseScalable;
    if (NumElts % Factor == 0 &&
2080 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2081 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2084 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2085 Alignment, AddressSpace, CostKind,
2086 UseMaskForCond, UseMaskForGaps);
2090 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2091 InstructionCost Cost = 0;
2092 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
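  // Under the AArch64 calling convention only the low 64 bits of v8-v15 are
  // callee-saved, so a 128-bit vector that is live across a call generally
  // has to be spilled and reloaded; model that as a store plus a load.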
2093 for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
        128)
      Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
              getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
  }
  return Cost;
}
2104 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
2105 return ST->getMaxInterleaveFactor();
2108 // For Falkor, we want to avoid having too many strided loads in a loop since
2109 // that can exhaust the HW prefetcher resources. We adjust the unroller
2110 // MaxCount preference below to attempt to ensure unrolling doesn't create too
2111 // many strided loads.
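// For example (illustrative), with MaxStridedLoads = 7 and three strided loads
// detected in the loop body, the unroll count below is capped at
// 1 << Log2_32(7 / 3) == 2.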
2113 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2114 TargetTransformInfo::UnrollingPreferences &UP) {
2115 enum { MaxStridedLoads = 7 };
2116 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2117 int StridedLoads = 0;
2118 // FIXME? We could make this more precise by looking at the CFG and
2119 // e.g. not counting loads in each side of an if-then-else diamond.
2120 for (const auto BB : L->blocks()) {
2121 for (auto &I : *BB) {
2122 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2126 Value *PtrValue = LMemI->getPointerOperand();
2127 if (L->isLoopInvariant(PtrValue))
2130 const SCEV *LSCEV = SE.getSCEV(PtrValue);
2131 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2132 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2135 // FIXME? We could take pairing of unrolled load copies into account
2136 // by looking at the AddRec, but we would probably have to limit this
2137 // to loops with no stores or other memory optimization barriers.
2139 // We've seen enough strided loads that seeing more won't make a
2141 if (StridedLoads > MaxStridedLoads / 2)
2142 return StridedLoads;
2145 return StridedLoads;
2148 int StridedLoads = countStridedLoads(L, SE);
2149 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
2150 << " strided loads\n");
  // Pick the largest power of 2 unroll count that won't result in too many
  // strided loads.
  if (StridedLoads) {
    UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
                      << UP.MaxCount << '\n');
  }
}
2160 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2161 TTI::UnrollingPreferences &UP,
2162 OptimizationRemarkEmitter *ORE) {
2163 // Enable partial unrolling and runtime unrolling.
2164 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2166 UP.UpperBound = true;
  // An inner loop is more likely to be hot, and the runtime check can be
  // hoisted out by the LICM pass, so the overhead is lower; try a larger
  // threshold to unroll more loops.
2171 if (L->getLoopDepth() > 1)
2172 UP.PartialThreshold *= 2;
2174 // Disable partial & runtime unrolling on -Os.
2175 UP.PartialOptSizeThreshold = 0;
2177 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
2178 EnableFalkorHWPFUnrollFix)
2179 getFalkorUnrollingPreferences(L, SE, UP);
2181 // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining. Don't unroll vector loops either, as they don't benefit much from
  // unrolling.
2184 for (auto *BB : L->getBlocks()) {
2185 for (auto &I : *BB) {
2186 // Don't unroll vectorised loop.
2187 if (I.getType()->isVectorTy())
2190 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2191 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2192 if (!isLoweredToCall(F))
2200 // Enable runtime unrolling for in-order models
2201 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
  // checking for that case, we can ensure that the default behaviour is
  // unchanged.
2204 if (ST->getProcFamily() != AArch64Subtarget::Others &&
2205 !ST->getSchedModel().isOutOfOrder()) {
2208 UP.UnrollRemainder = true;
2209 UP.DefaultUnrollRuntimeCount = 4;
2211 UP.UnrollAndJam = true;
2212 UP.UnrollAndJamInnerLoopThreshold = 60;
2216 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2217 TTI::PeelingPreferences &PP) {
2218 BaseT::getPeelingPreferences(L, SE, PP);
2221 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2222 Type *ExpectedType) {
2223 switch (Inst->getIntrinsicID()) {
2226 case Intrinsic::aarch64_neon_st2:
2227 case Intrinsic::aarch64_neon_st3:
2228 case Intrinsic::aarch64_neon_st4: {
2229 // Create a struct type
2230 StructType *ST = dyn_cast<StructType>(ExpectedType);
2233 unsigned NumElts = Inst->arg_size() - 1;
2234 if (ST->getNumElements() != NumElts)
2236 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2237 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2240 Value *Res = UndefValue::get(ExpectedType);
2241 IRBuilder<> Builder(Inst);
2242 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2243 Value *L = Inst->getArgOperand(i);
2244 Res = Builder.CreateInsertValue(Res, L, i);
2248 case Intrinsic::aarch64_neon_ld2:
2249 case Intrinsic::aarch64_neon_ld3:
2250 case Intrinsic::aarch64_neon_ld4:
2251 if (Inst->getType() == ExpectedType)
2257 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2258 MemIntrinsicInfo &Info) {
2259 switch (Inst->getIntrinsicID()) {
2262 case Intrinsic::aarch64_neon_ld2:
2263 case Intrinsic::aarch64_neon_ld3:
2264 case Intrinsic::aarch64_neon_ld4:
2265 Info.ReadMem = true;
2266 Info.WriteMem = false;
2267 Info.PtrVal = Inst->getArgOperand(0);
2269 case Intrinsic::aarch64_neon_st2:
2270 case Intrinsic::aarch64_neon_st3:
2271 case Intrinsic::aarch64_neon_st4:
2272 Info.ReadMem = false;
2273 Info.WriteMem = true;
2274 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  }
  return true;
}
/// See if \p I should be considered for address type promotion. We check if
/// \p I is a sext with the right type and is used in memory accesses. If it is
/// used in a "complex" getelementptr, we allow it to be promoted without
/// finding other sext instructions that sign extended the same initial value.
/// A getelementptr is considered "complex" if it has more than 2 operands.
2302 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
2303 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2304 bool Considerable = false;
2305 AllowPromotionWithoutCommonHeader = false;
2306 if (!isa<SExtInst>(&I))
2308 Type *ConsideredSExtType =
2309 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2310 if (I.getType() != ConsideredSExtType)
2312 // See if the sext is the one with the right type and used in at least one
2313 // GetElementPtrInst.
2314 for (const User *U : I.users()) {
2315 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2316 Considerable = true;
2317 // A getelementptr is considered as "complex" if it has more than 2
2318 // operands. We will promote a SExt used in such complex GEP as we
2319 // expect some computation to be merged if they are done on 64 bits.
2320 if (GEPInst->getNumOperands() > 2) {
2321 AllowPromotionWithoutCommonHeader = true;
2326 return Considerable;
2329 bool AArch64TTIImpl::isLegalToVectorizeReduction(
2330 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2331 if (!VF.isScalable())
2334 Type *Ty = RdxDesc.getRecurrenceType();
2335 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
2338 switch (RdxDesc.getRecurrenceKind()) {
2339 case RecurKind::Add:
2340 case RecurKind::FAdd:
2341 case RecurKind::And:
2343 case RecurKind::Xor:
2344 case RecurKind::SMin:
2345 case RecurKind::SMax:
2346 case RecurKind::UMin:
2347 case RecurKind::UMax:
2348 case RecurKind::FMin:
2349 case RecurKind::FMax:
2350 case RecurKind::SelectICmp:
2351 case RecurKind::SelectFCmp:
2352 case RecurKind::FMulAdd:
2360 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2362 TTI::TargetCostKind CostKind) {
2363 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
2365 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2366 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2368 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2369 "Both vector needs to be equally scalable");
2371 InstructionCost LegalizationCost = 0;
2373 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
    unsigned MinMaxOpcode =
        Ty->isFPOrFPVectorTy()
            ? Intrinsic::minnum
            : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2378 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2379 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2382 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2385 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
2386 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2387 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2388 InstructionCost LegalizationCost = 0;
2390 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2391 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2392 LegalizationCost *= LT.first - 1;
2395 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2396 assert(ISD && "Invalid opcode");
2397 // Add the final reduction cost for the legal horizontal reduction
2404 return LegalizationCost + 2;
2406 return InstructionCost::getInvalid();
2411 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2412 Optional<FastMathFlags> FMF,
2413 TTI::TargetCostKind CostKind) {
2414 if (TTI::requiresOrderedReduction(FMF)) {
2415 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2416 InstructionCost BaseCost =
2417 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2418 // Add on extra cost to reflect the extra overhead on some CPUs. We still
2419 // end up vectorizing for more computationally intensive loops.
2420 return BaseCost + FixedVTy->getNumElements();
2423 if (Opcode != Instruction::FAdd)
2424 return InstructionCost::getInvalid();
2426 auto *VTy = cast<ScalableVectorType>(ValTy);
2427 InstructionCost Cost =
2428 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2429 Cost *= getMaxNumElements(VTy->getElementCount());
2433 if (isa<ScalableVectorType>(ValTy))
2434 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2436 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2437 MVT MTy = LT.second;
2438 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2439 assert(ISD && "Invalid opcode");
2441 // Horizontal adds can use the 'addv' instruction. We model the cost of these
2442 // instructions as twice a normal vector add, plus 1 for each legalization
2443 // step (LT.first). This is the only arithmetic vector reduction operation for
2444 // which we have an instruction.
2445 // OR, XOR and AND costs should match the codegen from:
2446 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
2447 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
2448 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
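  // For example (illustrative): an i32 add reduction of <8 x i32> legalizes to
  // two v4i32 halves (LT.first == 2), so with the v4i32 entry below it costs
  // (LT.first - 1) + 2 == 3.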
2449 static const CostTblEntry CostTblNoPairwise[]{
2450 {ISD::ADD, MVT::v8i8, 2},
2451 {ISD::ADD, MVT::v16i8, 2},
2452 {ISD::ADD, MVT::v4i16, 2},
2453 {ISD::ADD, MVT::v8i16, 2},
2454 {ISD::ADD, MVT::v4i32, 2},
2455 {ISD::OR, MVT::v8i8, 15},
2456 {ISD::OR, MVT::v16i8, 17},
2457 {ISD::OR, MVT::v4i16, 7},
2458 {ISD::OR, MVT::v8i16, 9},
2459 {ISD::OR, MVT::v2i32, 3},
2460 {ISD::OR, MVT::v4i32, 5},
2461 {ISD::OR, MVT::v2i64, 3},
2462 {ISD::XOR, MVT::v8i8, 15},
2463 {ISD::XOR, MVT::v16i8, 17},
2464 {ISD::XOR, MVT::v4i16, 7},
2465 {ISD::XOR, MVT::v8i16, 9},
2466 {ISD::XOR, MVT::v2i32, 3},
2467 {ISD::XOR, MVT::v4i32, 5},
2468 {ISD::XOR, MVT::v2i64, 3},
2469 {ISD::AND, MVT::v8i8, 15},
2470 {ISD::AND, MVT::v16i8, 17},
2471 {ISD::AND, MVT::v4i16, 7},
2472 {ISD::AND, MVT::v8i16, 9},
2473 {ISD::AND, MVT::v2i32, 3},
2474 {ISD::AND, MVT::v4i32, 5},
2475 {ISD::AND, MVT::v2i64, 3},
2481 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2482 return (LT.first - 1) + Entry->Cost;
2487 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2490 auto *ValVTy = cast<FixedVectorType>(ValTy);
2491 if (!ValVTy->getElementType()->isIntegerTy(1) &&
2492 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2493 isPowerOf2_32(ValVTy->getNumElements())) {
2494 InstructionCost ExtraCost = 0;
2495 if (LT.first != 1) {
      // Type needs to be split, so there is an extra cost of LT.first - 1
      // arithmetic ops.
      auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2499 MTy.getVectorNumElements());
2500 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2501 ExtraCost *= LT.first - 1;
2503 return Entry->Cost + ExtraCost;
2507 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2510 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2511 static const CostTblEntry ShuffleTbl[] = {
2512 { TTI::SK_Splice, MVT::nxv16i8, 1 },
2513 { TTI::SK_Splice, MVT::nxv8i16, 1 },
2514 { TTI::SK_Splice, MVT::nxv4i32, 1 },
2515 { TTI::SK_Splice, MVT::nxv2i64, 1 },
2516 { TTI::SK_Splice, MVT::nxv2f16, 1 },
2517 { TTI::SK_Splice, MVT::nxv4f16, 1 },
2518 { TTI::SK_Splice, MVT::nxv8f16, 1 },
2519 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2520 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2521 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2522 { TTI::SK_Splice, MVT::nxv2f32, 1 },
2523 { TTI::SK_Splice, MVT::nxv4f32, 1 },
2524 { TTI::SK_Splice, MVT::nxv2f64, 1 },
2527 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2528 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2529 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2530 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2531 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2533 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2534 InstructionCost LegalizationCost = 0;
2537 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2538 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2539 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2540 CmpInst::BAD_ICMP_PREDICATE, CostKind);
  // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
  // The cost is computed on the promoted type.
2545 if (LT.second.getScalarType() == MVT::i1) {
2547 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2548 TTI::CastContextHint::None, CostKind) +
2549 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2550 TTI::CastContextHint::None, CostKind);
  }
  const auto *Entry =
      CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2554 assert(Entry && "Illegal Type for Splice");
2555 LegalizationCost += Entry->Cost;
2556 return LegalizationCost * LT.first;
2559 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2561 ArrayRef<int> Mask, int Index,
2562 VectorType *SubTp) {
2563 Kind = improveShuffleKindFromMask(Kind, Mask);
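  // For example (illustrative), a single-source v4i32 shuffle with mask
  // <3,2,1,0> is re-classified as SK_Reverse here and costed as 2 below
  // (rev64 + ext).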
2564 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
2565 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
2566 Kind == TTI::SK_Reverse) {
2567 static const CostTblEntry ShuffleTbl[] = {
2568 // Broadcast shuffle kinds can be performed with 'dup'.
2569 { TTI::SK_Broadcast, MVT::v8i8, 1 },
2570 { TTI::SK_Broadcast, MVT::v16i8, 1 },
2571 { TTI::SK_Broadcast, MVT::v4i16, 1 },
2572 { TTI::SK_Broadcast, MVT::v8i16, 1 },
2573 { TTI::SK_Broadcast, MVT::v2i32, 1 },
2574 { TTI::SK_Broadcast, MVT::v4i32, 1 },
2575 { TTI::SK_Broadcast, MVT::v2i64, 1 },
2576 { TTI::SK_Broadcast, MVT::v2f32, 1 },
2577 { TTI::SK_Broadcast, MVT::v4f32, 1 },
2578 { TTI::SK_Broadcast, MVT::v2f64, 1 },
2579 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
2580 // 'zip1/zip2' instructions.
2581 { TTI::SK_Transpose, MVT::v8i8, 1 },
2582 { TTI::SK_Transpose, MVT::v16i8, 1 },
2583 { TTI::SK_Transpose, MVT::v4i16, 1 },
2584 { TTI::SK_Transpose, MVT::v8i16, 1 },
2585 { TTI::SK_Transpose, MVT::v2i32, 1 },
2586 { TTI::SK_Transpose, MVT::v4i32, 1 },
2587 { TTI::SK_Transpose, MVT::v2i64, 1 },
2588 { TTI::SK_Transpose, MVT::v2f32, 1 },
2589 { TTI::SK_Transpose, MVT::v4f32, 1 },
2590 { TTI::SK_Transpose, MVT::v2f64, 1 },
2591 // Select shuffle kinds.
2592 // TODO: handle vXi8/vXi16.
2593 { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
2594 { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
2595 { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
2596 { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
2597 { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
2598 { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
2599 // PermuteSingleSrc shuffle kinds.
2600 { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
2601 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
2602 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
2603 { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
2604 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
2605 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
2606 { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
2607 { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
2608 { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
2609 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
2610 { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
2611 { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
2612 { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
2613 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
2614 // Reverse can be lowered with `rev`.
2615 { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
2616 { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
2617 { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
2618 { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
2619 { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
2620 { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
2621 // Broadcast shuffle kinds for scalable vectors
2622 { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
2623 { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
2624 { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
2625 { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
2626 { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
2627 { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
2628 { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
2629 { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
2630 { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
2631 { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
2632 { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
2633 { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
2634 { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
2635 { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
2636 { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
2637 { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
2638 { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
2639 // Handle the cases for vector.reverse with scalable vectors
2640 { TTI::SK_Reverse, MVT::nxv16i8, 1 },
2641 { TTI::SK_Reverse, MVT::nxv8i16, 1 },
2642 { TTI::SK_Reverse, MVT::nxv4i32, 1 },
2643 { TTI::SK_Reverse, MVT::nxv2i64, 1 },
2644 { TTI::SK_Reverse, MVT::nxv2f16, 1 },
2645 { TTI::SK_Reverse, MVT::nxv4f16, 1 },
2646 { TTI::SK_Reverse, MVT::nxv8f16, 1 },
2647 { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
2648 { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
2649 { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
2650 { TTI::SK_Reverse, MVT::nxv2f32, 1 },
2651 { TTI::SK_Reverse, MVT::nxv4f32, 1 },
2652 { TTI::SK_Reverse, MVT::nxv2f64, 1 },
2653 { TTI::SK_Reverse, MVT::nxv16i1, 1 },
2654 { TTI::SK_Reverse, MVT::nxv8i1, 1 },
2655 { TTI::SK_Reverse, MVT::nxv4i1, 1 },
2656 { TTI::SK_Reverse, MVT::nxv2i1, 1 },
    };
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2659 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
2660 return LT.first * Entry->Cost;
2662 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
2663 return getSpliceCost(Tp, Index);
2664 return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);