//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
    EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                             cl::desc("Enable AArch64 logical imm instruction "
                                      "optimization"),
                             cl::init(true));

// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in future when both implementations will be based off MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
    EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                   cl::desc("Combine extends of AArch64 masked "
                                            "gather intrinsics"),
                                   cl::init(true));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  }
}

static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  }
}

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
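// For illustration (assumed operand order, following the description above):
// a node such as FNEG_MERGE_PASSTHRU is expected to look like
//   (Predicate, Source, Passthru)
// where Passthru provides the result elements for inactive lanes.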
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);
    if (Subtarget->hasBF16())
      addDRTypeForNEON(MVT::v4bf16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
    if (Subtarget->hasBF16())
      addQRTypeForNEON(MVT::v8bf16);
  }

  if (Subtarget->hasSVE()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    if (Subtarget->hasBF16()) {
      addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
    }

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }

    for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
    }

    for (auto VT :
         { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
           MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);

    for (auto VT :
         { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
           MVT::nxv2f64 }) {
      setCondCodeAction(ISD::SETO, VT, Expand);
      setCondCodeAction(ISD::SETOLT, VT, Expand);
      setCondCodeAction(ISD::SETLT, VT, Expand);
      setCondCodeAction(ISD::SETOLE, VT, Expand);
      setCondCodeAction(ISD::SETLE, VT, Expand);
      setCondCodeAction(ISD::SETULT, VT, Expand);
      setCondCodeAction(ISD::SETULE, VT, Expand);
      setCondCodeAction(ISD::SETUGE, VT, Expand);
      setCondCodeAction(ISD::SETUGT, VT, Expand);
      setCondCodeAction(ISD::SETUEQ, VT, Expand);
      setCondCodeAction(ISD::SETUNE, VT, Expand);

      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FPOWI, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
    }
  }

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
  setOperationAction(ISD::ADDC, MVT::i32, Custom);
  setOperationAction(ISD::ADDE, MVT::i32, Custom);
  setOperationAction(ISD::SUBC, MVT::i32, Custom);
  setOperationAction(ISD::SUBE, MVT::i32, Custom);
  setOperationAction(ISD::ADDC, MVT::i64, Custom);
  setOperationAction(ISD::ADDE, MVT::i64, Custom);
  setOperationAction(ISD::SUBC, MVT::i64, Custom);
  setOperationAction(ISD::SUBE, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
  setOperationAction(ISD::CTPOP, MVT::i128, Custom);

  setOperationAction(ISD::ABS, MVT::i32, Custom);
  setOperationAction(ISD::ABS, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
  else
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

  setOperationAction(ISD::FREM, MVT::f16, Promote);
  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOW, MVT::f16, Promote);
  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::f16, Promote);
  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOS, MVT::f16, Promote);
  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FSIN, MVT::f16, Promote);
  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP, MVT::f16, Promote);
  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::f16, Promote);
  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG, MVT::f16, Promote);
  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::f16, Promote);
  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::f16, Promote);
  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

  if (!Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SELECT, MVT::f16, Promote);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
    setOperationAction(ISD::SETCC, MVT::f16, Promote);
    setOperationAction(ISD::BR_CC, MVT::f16, Promote);
    setOperationAction(ISD::FADD, MVT::f16, Promote);
    setOperationAction(ISD::FSUB, MVT::f16, Promote);
    setOperationAction(ISD::FMUL, MVT::f16, Promote);
    setOperationAction(ISD::FDIV, MVT::f16, Promote);
    setOperationAction(ISD::FMA, MVT::f16, Promote);
    setOperationAction(ISD::FNEG, MVT::f16, Promote);
    setOperationAction(ISD::FABS, MVT::f16, Promote);
    setOperationAction(ISD::FCEIL, MVT::f16, Promote);
    setOperationAction(ISD::FSQRT, MVT::f16, Promote);
    setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
    setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
    setOperationAction(ISD::FRINT, MVT::f16, Promote);
    setOperationAction(ISD::FROUND, MVT::f16, Promote);
    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
    setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
    setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
    setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);

    // promote v4f16 to v4f32 when that is known to be safe.
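    // For illustration (assumed behaviour of operation promotion): an fadd on
    // v4f16 is expected to be performed by extending both operands to v4f32,
    // adding in v4f32, and truncating the result back to v4f16, as set up by
    // the AddPromotedToType calls below.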
    setOperationAction(ISD::FADD, MVT::v4f16, Promote);
    setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
    setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
    setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
    AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
    AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);

    setOperationAction(ISD::FABS, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
    setOperationAction(ISD::FMA, MVT::v4f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);

    setOperationAction(ISD::FABS, MVT::v8f16, Expand);
    setOperationAction(ISD::FADD, MVT::v8f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
    setOperationAction(ISD::FMA, MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  }

  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (MVT Ty : {MVT::f32, MVT::f64}) {
    setOperationAction(ISD::FFLOOR, Ty, Legal);
    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    setOperationAction(ISD::FCEIL, Ty, Legal);
    setOperationAction(ISD::FRINT, Ty, Legal);
    setOperationAction(ISD::FTRUNC, Ty, Legal);
    setOperationAction(ISD::FROUND, Ty, Legal);
    setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
    setOperationAction(ISD::FMINNUM, Ty, Legal);
    setOperationAction(ISD::FMAXNUM, Ty, Legal);
    setOperationAction(ISD::FMINIMUM, Ty, Legal);
    setOperationAction(ISD::FMAXIMUM, Ty, Legal);
    setOperationAction(ISD::LROUND, Ty, Legal);
    setOperationAction(ISD::LLROUND, Ty, Legal);
    setOperationAction(ISD::LRINT, Ty, Legal);
    setOperationAction(ISD::LLRINT, Ty, Legal);
  }

  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
    setOperationAction(ISD::FCEIL, MVT::f16, Legal);
    setOperationAction(ISD::FRINT, MVT::f16, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
    setOperationAction(ISD::FROUND, MVT::f16, Legal);
    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // subtarget
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
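// As an illustration, LCALLNAMES(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp, 4)
// expands to four setLibcallName calls that map the _RELAX/_ACQ/_REL/_ACQ_REL
// variants to "__aarch64_swp4_relax", "__aarch64_swp4_acq", "__aarch64_swp4_rel"
// and "__aarch64_swp4_acq_rel" respectively.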
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
#undef LCALLNAMES
#undef LCALLNAME4
#undef LCALLNAME5
  }

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
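  // For example (assumed outcome of the custom lowering below): a single
  // non-temporal store of a v8i32 value is expected to be emitted as one STNP
  // of the two 128-bit halves rather than being split into unpaired stores.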
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
  // This requires the Performance Monitors extension.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV..
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ABS);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  // TODO: Do the same for FP_TO_*INT_SAT.
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::FP_TO_UINT);
  setTargetDAGCombine(ISD::FDIV);

  // Try and combine setcc with csel
  setTargetDAGCombine(ISD::SETCC);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::VECTOR_SPLICE);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::CONCAT_VECTORS);
  setTargetDAGCombine(ISD::STORE);
  if (Subtarget->supportsAddressTopByteIgnored())
    setTargetDAGCombine(ISD::LOAD);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::VSELECT);

  setTargetDAGCombine(ISD::INTRINSIC_VOID);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::VECREDUCE_ADD);
  setTargetDAGCombine(ISD::STEP_VECTOR);

  setTargetDAGCombine(ISD::GlobalAddress);

  // In case of strict alignment, avoid an excessive number of byte wide stores.
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemset = Subtarget->requiresStrictAlign()
                       ? MaxStoresPerMemsetOptSize : 32;

  MaxGluedStoresPerMemcpy = 4;
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
                       ? MaxStoresPerMemcpyOptSize : 16;

  MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;

  MaxLoadsPerMemcmpOptSize = 4;
  MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
                      ? MaxLoadsPerMemcmpOptSize : 8;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  EnableExtLdPromotion = true;

  // Set required alignment.
  setMinFunctionAlignment(Align(4));
  // Set preferred alignments.
  setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
  setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));

  // Only change the limit for entries in a jump table if specified by
  // the sub target, but not at the command line.
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
    setMaximumJumpTableSize(MaxJT);

  setHasExtractBitsInsn(true);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

    setOperationAction(ISD::MUL, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector ->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
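    // For illustration: a v4i8 -> v4f32 uint_to_fp is therefore performed as
    // v4i8 -> v4i32 (zero-extend) followed by v4i32 -> v4f32, via the
    // promotions registered below.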
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);

    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
    // Or, direct i32 -> f16 vector conversion. Set it to custom, so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    } else {
      // when AArch64 doesn't have fullfp16 support, promote the input
      // to i32 first.
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    }

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }

    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32, MVT::v2i64}) {
      setOperationAction(ISD::ABDS, VT, Legal);
      setOperationAction(ISD::ABDU, VT, Legal);
    }

    // Vector reductions
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
        setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
        setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);

        setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
      }
    }

    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
    setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
        setOperationAction(ISD::MULHS, VT, Legal);
        setOperationAction(ISD::MULHU, VT, Legal);
      } else {
        setOperationAction(ISD::MULHS, VT, Expand);
        setOperationAction(ISD::MULHU, VT, Expand);
      }
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::CTTZ, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
      setOperationAction(ISD::FRINT, Ty, Legal);
      setOperationAction(ISD::FTRUNC, Ty, Legal);
      setOperationAction(ISD::FROUND, Ty, Legal);
      setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
    }

    if (Subtarget->hasFullFP16()) {
      for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
        setOperationAction(ISD::FFLOOR, Ty, Legal);
        setOperationAction(ISD::FNEARBYINT, Ty, Legal);
        setOperationAction(ISD::FCEIL, Ty, Legal);
        setOperationAction(ISD::FRINT, Ty, Legal);
        setOperationAction(ISD::FTRUNC, Ty, Legal);
        setOperationAction(ISD::FROUND, Ty, Legal);
        setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
      }
    }

    if (Subtarget->hasSVE())
      setOperationAction(ISD::VSCALE, MVT::i32, Custom);

    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
  }

  if (Subtarget->hasSVE()) {
    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
      setOperationAction(ISD::BITREVERSE, VT, Custom);
      setOperationAction(ISD::BSWAP, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      setOperationAction(ISD::SINT_TO_FP, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::MULHS, VT, Custom);
      setOperationAction(ISD::MULHU, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);

      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);
    }

    // Illegal unpacked integer vector types.
    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
    }

    // Legalize unpacked bitcasts to REINTERPRET_CAST.
    for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
                    MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
      setOperationAction(ISD::BITCAST, VT, Custom);

    for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);

      // There are no legal MVT::nxv16f## based types.
      if (VT != MVT::nxv16i1) {
        setOperationAction(ISD::SINT_TO_FP, VT, Custom);
        setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      }
    }

    // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
                    MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                    MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MSTORE, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }

    for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
      for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) {
        // Avoid marking truncating FP stores as legal to prevent the
        // DAGCombiner from creating unsupported truncating stores.
        setTruncStoreAction(VT, InnerVT, Expand);
        // SVE does not have floating-point extending loads.
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // SVE supports truncating stores of 64 and 128-bit vectors
    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);

    for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
                    MVT::nxv4f32, MVT::nxv2f64}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::FADD, VT, Custom);
      setOperationAction(ISD::FDIV, VT, Custom);
      setOperationAction(ISD::FMA, VT, Custom);
      setOperationAction(ISD::FMAXIMUM, VT, Custom);
      setOperationAction(ISD::FMAXNUM, VT, Custom);
      setOperationAction(ISD::FMINIMUM, VT, Custom);
      setOperationAction(ISD::FMINNUM, VT, Custom);
      setOperationAction(ISD::FMUL, VT, Custom);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FSUB, VT, Custom);
      setOperationAction(ISD::FCEIL, VT, Custom);
      setOperationAction(ISD::FFLOOR, VT, Custom);
      setOperationAction(ISD::FNEARBYINT, VT, Custom);
      setOperationAction(ISD::FRINT, VT, Custom);
      setOperationAction(ISD::FROUND, VT, Custom);
      setOperationAction(ISD::FROUNDEVEN, VT, Custom);
      setOperationAction(ISD::FTRUNC, VT, Custom);
      setOperationAction(ISD::FSQRT, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FP_EXTEND, VT, Custom);
      setOperationAction(ISD::FP_ROUND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);

      setOperationAction(ISD::SELECT_CC, VT, Expand);
    }

    for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
    }

    setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);

    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1310 // NOTE: Currently this has to happen after computeRegisterProperties rather
1311 // than the preferred option of combining it with the addRegisterClass call.
1312 if (Subtarget->useSVEForFixedLengthVectors()) {
1313 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1314 if (useSVEForFixedLengthVectorVT(VT))
1315 addTypeForFixedLengthSVE(VT);
1316 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1317 if (useSVEForFixedLengthVectorVT(VT))
1318 addTypeForFixedLengthSVE(VT);
1320 // 64bit results can mean a bigger than NEON input.
1321 for (auto VT : {MVT::v8i8, MVT::v4i16})
1322 setOperationAction(ISD::TRUNCATE, VT, Custom);
1323 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1325 // 128-bit results imply a bigger-than-NEON input.
1326 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1327 setOperationAction(ISD::TRUNCATE, VT, Custom);
1328 for (auto VT : {MVT::v8f16, MVT::v4f32})
1329 setOperationAction(ISD::FP_ROUND, VT, Custom);
1331 // These operations are not supported on NEON but SVE can do them.
1332 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1333 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1334 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1335 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1336 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1337 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1338 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1339 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1340 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1341 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1342 setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
1343 setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
1344 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
1345 setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
1346 setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
1347 setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
1348 setOperationAction(ISD::SDIV, MVT::v1i64, Custom);
1349 setOperationAction(ISD::SDIV, MVT::v2i64, Custom);
1350 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1351 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1352 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1353 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1354 setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
1355 setOperationAction(ISD::UDIV, MVT::v16i8, Custom);
1356 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
1357 setOperationAction(ISD::UDIV, MVT::v8i16, Custom);
1358 setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
1359 setOperationAction(ISD::UDIV, MVT::v4i32, Custom);
1360 setOperationAction(ISD::UDIV, MVT::v1i64, Custom);
1361 setOperationAction(ISD::UDIV, MVT::v2i64, Custom);
1362 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1363 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1364 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1365 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1366 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1367 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1368 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1369 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1371 // Int operations with no NEON support.
1372 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1373 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1374 setOperationAction(ISD::BITREVERSE, VT, Custom);
1375 setOperationAction(ISD::CTTZ, VT, Custom);
1376 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1377 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1378 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1381 // FP operations with no NEON support.
1382 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1383 MVT::v1f64, MVT::v2f64})
1384 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1386 // Use SVE for vectors with more than 2 elements.
1387 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1388 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1391 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1392 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1393 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1394 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1397 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1400 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1401 assert(VT.isVector() && "VT should be a vector type");
1403 if (VT.isFloatingPoint()) {
1404 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1405 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1406 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1409 // Mark vector float intrinsics as expand.
1410 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1411 setOperationAction(ISD::FSIN, VT, Expand);
1412 setOperationAction(ISD::FCOS, VT, Expand);
1413 setOperationAction(ISD::FPOW, VT, Expand);
1414 setOperationAction(ISD::FLOG, VT, Expand);
1415 setOperationAction(ISD::FLOG2, VT, Expand);
1416 setOperationAction(ISD::FLOG10, VT, Expand);
1417 setOperationAction(ISD::FEXP, VT, Expand);
1418 setOperationAction(ISD::FEXP2, VT, Expand);
1421 // But we do support custom-lowering for FCOPYSIGN.
1422 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1423 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1424 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1426 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1427 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1428 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1429 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1430 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1431 setOperationAction(ISD::SRA, VT, Custom);
1432 setOperationAction(ISD::SRL, VT, Custom);
1433 setOperationAction(ISD::SHL, VT, Custom);
1434 setOperationAction(ISD::OR, VT, Custom);
1435 setOperationAction(ISD::SETCC, VT, Custom);
1436 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1438 setOperationAction(ISD::SELECT, VT, Expand);
1439 setOperationAction(ISD::SELECT_CC, VT, Expand);
1440 setOperationAction(ISD::VSELECT, VT, Expand);
1441 for (MVT InnerVT : MVT::all_valuetypes())
1442 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1444 // CNT supports only B element sizes; use UADDLP to widen other element types.
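// For example, a v4i32 CTPOP is lowered as CNT on the byte vector followed
// by two UADDLP steps to widen the per-byte counts back to 32-bit lanes.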
1445 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1446 setOperationAction(ISD::CTPOP, VT, Custom);
1448 setOperationAction(ISD::UDIV, VT, Expand);
1449 setOperationAction(ISD::SDIV, VT, Expand);
1450 setOperationAction(ISD::UREM, VT, Expand);
1451 setOperationAction(ISD::SREM, VT, Expand);
1452 setOperationAction(ISD::FREM, VT, Expand);
1454 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1455 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1457 if (!VT.isFloatingPoint())
1458 setOperationAction(ISD::ABS, VT, Legal);
1460 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1461 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1462 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1463 setOperationAction(Opcode, VT, Legal);
1465 // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1466 if (VT.isFloatingPoint() &&
1467 VT.getVectorElementType() != MVT::bf16 &&
1468 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1469 for (unsigned Opcode :
1470 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
1471 setOperationAction(Opcode, VT, Legal);
1473 if (Subtarget->isLittleEndian()) {
1474 for (unsigned im = (unsigned)ISD::PRE_INC;
1475 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1476 setIndexedLoadAction(im, VT, Legal);
1477 setIndexedStoreAction(im, VT, Legal);
1482 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1483 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1485 // By default everything must be expanded.
1486 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1487 setOperationAction(Op, VT, Expand);
1489 // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1490 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1492 if (VT.isFloatingPoint()) {
1493 setCondCodeAction(ISD::SETO, VT, Expand);
1494 setCondCodeAction(ISD::SETOLT, VT, Expand);
1495 setCondCodeAction(ISD::SETLT, VT, Expand);
1496 setCondCodeAction(ISD::SETOLE, VT, Expand);
1497 setCondCodeAction(ISD::SETLE, VT, Expand);
1498 setCondCodeAction(ISD::SETULT, VT, Expand);
1499 setCondCodeAction(ISD::SETULE, VT, Expand);
1500 setCondCodeAction(ISD::SETUGE, VT, Expand);
1501 setCondCodeAction(ISD::SETUGT, VT, Expand);
1502 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1503 setCondCodeAction(ISD::SETUNE, VT, Expand);
1506 // Mark integer truncating stores as having custom lowering
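// For example, for VT == v4i32 the loop below marks truncating stores to
// v4i8 and v4i16 as Custom.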
1507 if (VT.isInteger()) {
1508 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1509 while (InnerVT != VT) {
1510 setTruncStoreAction(VT, InnerVT, Custom);
1511 InnerVT = InnerVT.changeVectorElementType(
1512 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1516 // Lower fixed length vector operations to scalable equivalents.
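// Most of the Custom actions below are lowered by wrapping the fixed-length
// operands in a scalable container (see convertToScalableVector) and
// emitting the equivalent predicated SVE operation.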
1517 setOperationAction(ISD::ABS, VT, Custom);
1518 setOperationAction(ISD::ADD, VT, Custom);
1519 setOperationAction(ISD::AND, VT, Custom);
1520 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1521 setOperationAction(ISD::BITCAST, VT, Custom);
1522 setOperationAction(ISD::BITREVERSE, VT, Custom);
1523 setOperationAction(ISD::BSWAP, VT, Custom);
1524 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1525 setOperationAction(ISD::CTLZ, VT, Custom);
1526 setOperationAction(ISD::CTPOP, VT, Custom);
1527 setOperationAction(ISD::CTTZ, VT, Custom);
1528 setOperationAction(ISD::FABS, VT, Custom);
1529 setOperationAction(ISD::FADD, VT, Custom);
1530 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1531 setOperationAction(ISD::FCEIL, VT, Custom);
1532 setOperationAction(ISD::FDIV, VT, Custom);
1533 setOperationAction(ISD::FFLOOR, VT, Custom);
1534 setOperationAction(ISD::FMA, VT, Custom);
1535 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1536 setOperationAction(ISD::FMAXNUM, VT, Custom);
1537 setOperationAction(ISD::FMINIMUM, VT, Custom);
1538 setOperationAction(ISD::FMINNUM, VT, Custom);
1539 setOperationAction(ISD::FMUL, VT, Custom);
1540 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1541 setOperationAction(ISD::FNEG, VT, Custom);
1542 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1543 setOperationAction(ISD::FP_ROUND, VT, Custom);
1544 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1545 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1546 setOperationAction(ISD::FRINT, VT, Custom);
1547 setOperationAction(ISD::FROUND, VT, Custom);
1548 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1549 setOperationAction(ISD::FSQRT, VT, Custom);
1550 setOperationAction(ISD::FSUB, VT, Custom);
1551 setOperationAction(ISD::FTRUNC, VT, Custom);
1552 setOperationAction(ISD::LOAD, VT, Custom);
1553 setOperationAction(ISD::MGATHER, VT, Custom);
1554 setOperationAction(ISD::MLOAD, VT, Custom);
1555 setOperationAction(ISD::MSCATTER, VT, Custom);
1556 setOperationAction(ISD::MSTORE, VT, Custom);
1557 setOperationAction(ISD::MUL, VT, Custom);
1558 setOperationAction(ISD::MULHS, VT, Custom);
1559 setOperationAction(ISD::MULHU, VT, Custom);
1560 setOperationAction(ISD::OR, VT, Custom);
1561 setOperationAction(ISD::SDIV, VT, Custom);
1562 setOperationAction(ISD::SELECT, VT, Custom);
1563 setOperationAction(ISD::SETCC, VT, Custom);
1564 setOperationAction(ISD::SHL, VT, Custom);
1565 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1566 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1567 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1568 setOperationAction(ISD::SMAX, VT, Custom);
1569 setOperationAction(ISD::SMIN, VT, Custom);
1570 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1571 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1572 setOperationAction(ISD::SRA, VT, Custom);
1573 setOperationAction(ISD::SRL, VT, Custom);
1574 setOperationAction(ISD::STORE, VT, Custom);
1575 setOperationAction(ISD::SUB, VT, Custom);
1576 setOperationAction(ISD::TRUNCATE, VT, Custom);
1577 setOperationAction(ISD::UDIV, VT, Custom);
1578 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1579 setOperationAction(ISD::UMAX, VT, Custom);
1580 setOperationAction(ISD::UMIN, VT, Custom);
1581 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1582 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1583 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1584 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1585 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1586 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1587 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1588 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1589 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1590 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1591 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1592 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1593 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1594 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1595 setOperationAction(ISD::VSELECT, VT, Custom);
1596 setOperationAction(ISD::XOR, VT, Custom);
1597 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1600 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1601 addRegisterClass(VT, &AArch64::FPR64RegClass);
1605 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1606 addRegisterClass(VT, &AArch64::FPR128RegClass);
1610 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1611 LLVMContext &C, EVT VT) const {
1614 if (VT.isScalableVector())
1615 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1616 return VT.changeVectorElementTypeToInteger();
1619 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1620 const APInt &Demanded,
1621 TargetLowering::TargetLoweringOpt &TLO,
1623 uint64_t OldImm = Imm, NewImm, Enc;
1624 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1626 // Return if the immediate is already all zeros, all ones, a bimm32 or a
// bimm64.
1628 if (Imm == 0 || Imm == Mask ||
1629 AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1632 unsigned EltSize = Size;
1633 uint64_t DemandedBits = Demanded.getZExtValue();
1635 // Clear bits that are not demanded.
1636 Imm &= DemandedBits;
1639 // The goal here is to set the non-demanded bits in a way that minimizes
1640 // the number of transitions between 0 and 1. In order to achieve this goal,
1641 // we set the non-demanded bits to the value of the preceding demanded bits.
1642 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1643 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1644 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1645 // The final result is 0b11000011.
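// Minimising the number of 0<->1 transitions makes it more likely that the
// resulting immediate (or its complement) is a single contiguous run of set
// bits, which is the shape the logical immediate encoding can represent.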
1646 uint64_t NonDemandedBits = ~DemandedBits;
1647 uint64_t InvertedImm = ~Imm & DemandedBits;
1648 uint64_t RotatedImm =
1649 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1651 uint64_t Sum = RotatedImm + NonDemandedBits;
1652 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1653 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1654 NewImm = (Imm | Ones) & Mask;
1656 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1657 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1658 // we halve the element size and continue the search.
1659 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1662 // We cannot shrink the element size any further if it is 2 bits.
1668 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1670 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
1671 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1674 // Merge the upper and lower halves of Imm and DemandedBits.
Imm |= Hi;
1676 DemandedBits |= DemandedBitsHi;
1681 // Replicate the element across the register width.
1682 while (EltSize < Size) {
1683 NewImm |= NewImm << EltSize;
EltSize *= 2;
}
1688 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1689 "demanded bits should never be altered");
1690 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1692 // Create the new constant immediate node.
1693 EVT VT = Op.getValueType();
1697 // If the new constant immediate is all-zeros or all-ones, let the target
1698 // independent DAG combine optimize this node.
1699 if (NewImm == 0 || NewImm == OrigMask) {
1700 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1701 TLO.DAG.getConstant(NewImm, DL, VT));
1702 // Otherwise, create a machine node so that target independent DAG combine
1703 // doesn't undo this optimization.
1705 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1706 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1708 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1711 return TLO.CombineTo(Op, New);
1714 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1715 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1716 TargetLoweringOpt &TLO) const {
1717 // Delay this optimization as late as possible.
1721 if (!EnableOptimizeLogicalImm)
1724 EVT VT = Op.getValueType();
1728 unsigned Size = VT.getSizeInBits();
1729 assert((Size == 32 || Size == 64) &&
1730 "i32 or i64 is expected after legalization.");
1732 // Exit early if we demand all bits.
1733 if (DemandedBits.countPopulation() == Size)
1737 switch (Op.getOpcode()) {
1741 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1744 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1747 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1750 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1753 uint64_t Imm = C->getZExtValue();
1754 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1757 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1758 /// Mask are known to be either zero or one and return them in Known.
1759 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1760 const SDValue Op, KnownBits &Known,
1761 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1762 switch (Op.getOpcode()) {
1765 case AArch64ISD::CSEL: {
1767 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1768 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1769 Known = KnownBits::commonBits(Known, Known2);
1772 case AArch64ISD::LOADgot:
1773 case AArch64ISD::ADDlow: {
1774 if (!Subtarget->isTargetILP32())
1776 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1777 Known.Zero = APInt::getHighBitsSet(64, 32);
1780 case ISD::INTRINSIC_W_CHAIN: {
1781 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1782 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1785 case Intrinsic::aarch64_ldaxr:
1786 case Intrinsic::aarch64_ldxr: {
1787 unsigned BitWidth = Known.getBitWidth();
1788 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1789 unsigned MemBits = VT.getScalarSizeInBits();
1790 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1796 case ISD::INTRINSIC_WO_CHAIN:
1797 case ISD::INTRINSIC_VOID: {
1798 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1802 case Intrinsic::aarch64_neon_umaxv:
1803 case Intrinsic::aarch64_neon_uminv: {
1804 // Figure out the datatype of the vector operand. The UMINV instruction
1805 // will zero extend the result, so we can mark as known zero all the
1806 // bits larger than the element datatype. 32-bit or larger doesn't need
1807 // this as those are legal types and will be handled by isel directly.
1808 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1809 unsigned BitWidth = Known.getBitWidth();
1810 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1811 assert(BitWidth >= 8 && "Unexpected width!");
1812 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1814 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1815 assert(BitWidth >= 16 && "Unexpected width!");
1816 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1826 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1831 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1832 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1834 if (Subtarget->requiresStrictAlign())
1838 // Some CPUs are fine with unaligned stores except for 128-bit ones.
1839 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1840 // See comments in performSTORECombine() for more details about
1841 // these conditions.
1843 // Code that uses clang vector extensions can mark that it
1844 // wants unaligned accesses to be treated as fast by
1845 // underspecifying alignment to be 1 or 2.
1848 // Disregard v2i64. Memcpy lowering produces those and splitting
1849 // them regresses performance on micro-benchmarks and olden/bh.
1855 // Same as above but handling LLTs instead.
1856 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1857 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1859 if (Subtarget->requiresStrictAlign())
1863 // Some CPUs are fine with unaligned stores except for 128-bit ones.
1864 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1865 Ty.getSizeInBytes() != 16 ||
1866 // See comments in performSTORECombine() for more details about
1867 // these conditions.
1869 // Code that uses clang vector extensions can mark that it
1870 // wants unaligned accesses to be treated as fast by
1871 // underspecifying alignment to be 1 or 2.
1874 // Disregard v2i64. Memcpy lowering produces those and splitting
1875 // them regresses performance on micro-benchmarks and olden/bh.
1876 Ty == LLT::fixed_vector(2, 64);
FastISel *
1882 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1883 const TargetLibraryInfo *libInfo) const {
1884 return AArch64::createFastISel(funcInfo, libInfo);
1887 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1888 #define MAKE_CASE(V) \
1891 switch ((AArch64ISD::NodeType)Opcode) {
1892 case AArch64ISD::FIRST_NUMBER:
1894 MAKE_CASE(AArch64ISD::CALL)
1895 MAKE_CASE(AArch64ISD::ADRP)
1896 MAKE_CASE(AArch64ISD::ADR)
1897 MAKE_CASE(AArch64ISD::ADDlow)
1898 MAKE_CASE(AArch64ISD::LOADgot)
1899 MAKE_CASE(AArch64ISD::RET_FLAG)
1900 MAKE_CASE(AArch64ISD::BRCOND)
1901 MAKE_CASE(AArch64ISD::CSEL)
1902 MAKE_CASE(AArch64ISD::CSINV)
1903 MAKE_CASE(AArch64ISD::CSNEG)
1904 MAKE_CASE(AArch64ISD::CSINC)
1905 MAKE_CASE(AArch64ISD::THREAD_POINTER)
1906 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
1907 MAKE_CASE(AArch64ISD::ADD_PRED)
1908 MAKE_CASE(AArch64ISD::MUL_PRED)
1909 MAKE_CASE(AArch64ISD::MULHS_PRED)
1910 MAKE_CASE(AArch64ISD::MULHU_PRED)
1911 MAKE_CASE(AArch64ISD::SDIV_PRED)
1912 MAKE_CASE(AArch64ISD::SHL_PRED)
1913 MAKE_CASE(AArch64ISD::SMAX_PRED)
1914 MAKE_CASE(AArch64ISD::SMIN_PRED)
1915 MAKE_CASE(AArch64ISD::SRA_PRED)
1916 MAKE_CASE(AArch64ISD::SRL_PRED)
1917 MAKE_CASE(AArch64ISD::SUB_PRED)
1918 MAKE_CASE(AArch64ISD::UDIV_PRED)
1919 MAKE_CASE(AArch64ISD::UMAX_PRED)
1920 MAKE_CASE(AArch64ISD::UMIN_PRED)
1921 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
1922 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
1923 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
1924 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
1925 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
1926 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
1927 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
1928 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
1929 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
1930 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
1931 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
1932 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
1933 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
1934 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
1935 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
1936 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
1937 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
1938 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
1939 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
1940 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
1941 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
1942 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
1943 MAKE_CASE(AArch64ISD::ADC)
1944 MAKE_CASE(AArch64ISD::SBC)
1945 MAKE_CASE(AArch64ISD::ADDS)
1946 MAKE_CASE(AArch64ISD::SUBS)
1947 MAKE_CASE(AArch64ISD::ADCS)
1948 MAKE_CASE(AArch64ISD::SBCS)
1949 MAKE_CASE(AArch64ISD::ANDS)
1950 MAKE_CASE(AArch64ISD::CCMP)
1951 MAKE_CASE(AArch64ISD::CCMN)
1952 MAKE_CASE(AArch64ISD::FCCMP)
1953 MAKE_CASE(AArch64ISD::FCMP)
1954 MAKE_CASE(AArch64ISD::STRICT_FCMP)
1955 MAKE_CASE(AArch64ISD::STRICT_FCMPE)
1956 MAKE_CASE(AArch64ISD::DUP)
1957 MAKE_CASE(AArch64ISD::DUPLANE8)
1958 MAKE_CASE(AArch64ISD::DUPLANE16)
1959 MAKE_CASE(AArch64ISD::DUPLANE32)
1960 MAKE_CASE(AArch64ISD::DUPLANE64)
1961 MAKE_CASE(AArch64ISD::MOVI)
1962 MAKE_CASE(AArch64ISD::MOVIshift)
1963 MAKE_CASE(AArch64ISD::MOVIedit)
1964 MAKE_CASE(AArch64ISD::MOVImsl)
1965 MAKE_CASE(AArch64ISD::FMOV)
1966 MAKE_CASE(AArch64ISD::MVNIshift)
1967 MAKE_CASE(AArch64ISD::MVNImsl)
1968 MAKE_CASE(AArch64ISD::BICi)
1969 MAKE_CASE(AArch64ISD::ORRi)
1970 MAKE_CASE(AArch64ISD::BSP)
1971 MAKE_CASE(AArch64ISD::EXTR)
1972 MAKE_CASE(AArch64ISD::ZIP1)
1973 MAKE_CASE(AArch64ISD::ZIP2)
1974 MAKE_CASE(AArch64ISD::UZP1)
1975 MAKE_CASE(AArch64ISD::UZP2)
1976 MAKE_CASE(AArch64ISD::TRN1)
1977 MAKE_CASE(AArch64ISD::TRN2)
1978 MAKE_CASE(AArch64ISD::REV16)
1979 MAKE_CASE(AArch64ISD::REV32)
1980 MAKE_CASE(AArch64ISD::REV64)
1981 MAKE_CASE(AArch64ISD::EXT)
1982 MAKE_CASE(AArch64ISD::SPLICE)
1983 MAKE_CASE(AArch64ISD::VSHL)
1984 MAKE_CASE(AArch64ISD::VLSHR)
1985 MAKE_CASE(AArch64ISD::VASHR)
1986 MAKE_CASE(AArch64ISD::VSLI)
1987 MAKE_CASE(AArch64ISD::VSRI)
1988 MAKE_CASE(AArch64ISD::CMEQ)
1989 MAKE_CASE(AArch64ISD::CMGE)
1990 MAKE_CASE(AArch64ISD::CMGT)
1991 MAKE_CASE(AArch64ISD::CMHI)
1992 MAKE_CASE(AArch64ISD::CMHS)
1993 MAKE_CASE(AArch64ISD::FCMEQ)
1994 MAKE_CASE(AArch64ISD::FCMGE)
1995 MAKE_CASE(AArch64ISD::FCMGT)
1996 MAKE_CASE(AArch64ISD::CMEQz)
1997 MAKE_CASE(AArch64ISD::CMGEz)
1998 MAKE_CASE(AArch64ISD::CMGTz)
1999 MAKE_CASE(AArch64ISD::CMLEz)
2000 MAKE_CASE(AArch64ISD::CMLTz)
2001 MAKE_CASE(AArch64ISD::FCMEQz)
2002 MAKE_CASE(AArch64ISD::FCMGEz)
2003 MAKE_CASE(AArch64ISD::FCMGTz)
2004 MAKE_CASE(AArch64ISD::FCMLEz)
2005 MAKE_CASE(AArch64ISD::FCMLTz)
2006 MAKE_CASE(AArch64ISD::SADDV)
2007 MAKE_CASE(AArch64ISD::UADDV)
2008 MAKE_CASE(AArch64ISD::SRHADD)
2009 MAKE_CASE(AArch64ISD::URHADD)
2010 MAKE_CASE(AArch64ISD::SHADD)
2011 MAKE_CASE(AArch64ISD::UHADD)
2012 MAKE_CASE(AArch64ISD::SDOT)
2013 MAKE_CASE(AArch64ISD::UDOT)
2014 MAKE_CASE(AArch64ISD::SMINV)
2015 MAKE_CASE(AArch64ISD::UMINV)
2016 MAKE_CASE(AArch64ISD::SMAXV)
2017 MAKE_CASE(AArch64ISD::UMAXV)
2018 MAKE_CASE(AArch64ISD::SADDV_PRED)
2019 MAKE_CASE(AArch64ISD::UADDV_PRED)
2020 MAKE_CASE(AArch64ISD::SMAXV_PRED)
2021 MAKE_CASE(AArch64ISD::UMAXV_PRED)
2022 MAKE_CASE(AArch64ISD::SMINV_PRED)
2023 MAKE_CASE(AArch64ISD::UMINV_PRED)
2024 MAKE_CASE(AArch64ISD::ORV_PRED)
2025 MAKE_CASE(AArch64ISD::EORV_PRED)
2026 MAKE_CASE(AArch64ISD::ANDV_PRED)
2027 MAKE_CASE(AArch64ISD::CLASTA_N)
2028 MAKE_CASE(AArch64ISD::CLASTB_N)
2029 MAKE_CASE(AArch64ISD::LASTA)
2030 MAKE_CASE(AArch64ISD::LASTB)
2031 MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2032 MAKE_CASE(AArch64ISD::LS64_BUILD)
2033 MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2034 MAKE_CASE(AArch64ISD::TBL)
2035 MAKE_CASE(AArch64ISD::FADD_PRED)
2036 MAKE_CASE(AArch64ISD::FADDA_PRED)
2037 MAKE_CASE(AArch64ISD::FADDV_PRED)
2038 MAKE_CASE(AArch64ISD::FDIV_PRED)
2039 MAKE_CASE(AArch64ISD::FMA_PRED)
2040 MAKE_CASE(AArch64ISD::FMAX_PRED)
2041 MAKE_CASE(AArch64ISD::FMAXV_PRED)
2042 MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2043 MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2044 MAKE_CASE(AArch64ISD::FMIN_PRED)
2045 MAKE_CASE(AArch64ISD::FMINV_PRED)
2046 MAKE_CASE(AArch64ISD::FMINNM_PRED)
2047 MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2048 MAKE_CASE(AArch64ISD::FMUL_PRED)
2049 MAKE_CASE(AArch64ISD::FSUB_PRED)
2050 MAKE_CASE(AArch64ISD::BIC)
2051 MAKE_CASE(AArch64ISD::BIT)
2052 MAKE_CASE(AArch64ISD::CBZ)
2053 MAKE_CASE(AArch64ISD::CBNZ)
2054 MAKE_CASE(AArch64ISD::TBZ)
2055 MAKE_CASE(AArch64ISD::TBNZ)
2056 MAKE_CASE(AArch64ISD::TC_RETURN)
2057 MAKE_CASE(AArch64ISD::PREFETCH)
2058 MAKE_CASE(AArch64ISD::SITOF)
2059 MAKE_CASE(AArch64ISD::UITOF)
2060 MAKE_CASE(AArch64ISD::NVCAST)
2061 MAKE_CASE(AArch64ISD::MRS)
2062 MAKE_CASE(AArch64ISD::SQSHL_I)
2063 MAKE_CASE(AArch64ISD::UQSHL_I)
2064 MAKE_CASE(AArch64ISD::SRSHR_I)
2065 MAKE_CASE(AArch64ISD::URSHR_I)
2066 MAKE_CASE(AArch64ISD::SQSHLU_I)
2067 MAKE_CASE(AArch64ISD::WrapperLarge)
2068 MAKE_CASE(AArch64ISD::LD2post)
2069 MAKE_CASE(AArch64ISD::LD3post)
2070 MAKE_CASE(AArch64ISD::LD4post)
2071 MAKE_CASE(AArch64ISD::ST2post)
2072 MAKE_CASE(AArch64ISD::ST3post)
2073 MAKE_CASE(AArch64ISD::ST4post)
2074 MAKE_CASE(AArch64ISD::LD1x2post)
2075 MAKE_CASE(AArch64ISD::LD1x3post)
2076 MAKE_CASE(AArch64ISD::LD1x4post)
2077 MAKE_CASE(AArch64ISD::ST1x2post)
2078 MAKE_CASE(AArch64ISD::ST1x3post)
2079 MAKE_CASE(AArch64ISD::ST1x4post)
2080 MAKE_CASE(AArch64ISD::LD1DUPpost)
2081 MAKE_CASE(AArch64ISD::LD2DUPpost)
2082 MAKE_CASE(AArch64ISD::LD3DUPpost)
2083 MAKE_CASE(AArch64ISD::LD4DUPpost)
2084 MAKE_CASE(AArch64ISD::LD1LANEpost)
2085 MAKE_CASE(AArch64ISD::LD2LANEpost)
2086 MAKE_CASE(AArch64ISD::LD3LANEpost)
2087 MAKE_CASE(AArch64ISD::LD4LANEpost)
2088 MAKE_CASE(AArch64ISD::ST2LANEpost)
2089 MAKE_CASE(AArch64ISD::ST3LANEpost)
2090 MAKE_CASE(AArch64ISD::ST4LANEpost)
2091 MAKE_CASE(AArch64ISD::SMULL)
2092 MAKE_CASE(AArch64ISD::UMULL)
2093 MAKE_CASE(AArch64ISD::FRECPE)
2094 MAKE_CASE(AArch64ISD::FRECPS)
2095 MAKE_CASE(AArch64ISD::FRSQRTE)
2096 MAKE_CASE(AArch64ISD::FRSQRTS)
2097 MAKE_CASE(AArch64ISD::STG)
2098 MAKE_CASE(AArch64ISD::STZG)
2099 MAKE_CASE(AArch64ISD::ST2G)
2100 MAKE_CASE(AArch64ISD::STZ2G)
2101 MAKE_CASE(AArch64ISD::SUNPKHI)
2102 MAKE_CASE(AArch64ISD::SUNPKLO)
2103 MAKE_CASE(AArch64ISD::UUNPKHI)
2104 MAKE_CASE(AArch64ISD::UUNPKLO)
2105 MAKE_CASE(AArch64ISD::INSR)
2106 MAKE_CASE(AArch64ISD::PTEST)
2107 MAKE_CASE(AArch64ISD::PTRUE)
2108 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2109 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2110 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2111 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2112 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2113 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2114 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2115 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2116 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2117 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2118 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2119 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2120 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2121 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2122 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2123 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2124 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2125 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2126 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2127 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2128 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2129 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2130 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2131 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2132 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2133 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2134 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2135 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2136 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2137 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2138 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2139 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2140 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2141 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2142 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2143 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2144 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2145 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2146 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2147 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2148 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2149 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2150 MAKE_CASE(AArch64ISD::ST1_PRED)
2151 MAKE_CASE(AArch64ISD::SST1_PRED)
2152 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2153 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2154 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2155 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2156 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2157 MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2158 MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2159 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2160 MAKE_CASE(AArch64ISD::LDP)
2161 MAKE_CASE(AArch64ISD::STP)
2162 MAKE_CASE(AArch64ISD::STNP)
2163 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2164 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2165 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2166 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2167 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2168 MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2169 MAKE_CASE(AArch64ISD::UADDLP)
2170 MAKE_CASE(AArch64ISD::CALL_RVMARKER)
MachineBasicBlock *
2177 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2178 MachineBasicBlock *MBB) const {
2179 // We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
//
// OrigBB:
2183 //     [... previous instrs leading to comparison ...]
//     b.cond TrueBB
//     b EndBB
// TrueBB:
//     ; Fallthrough
// EndBB:
2189 //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2191 MachineFunction *MF = MBB->getParent();
2192 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2193 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2194 DebugLoc DL = MI.getDebugLoc();
2195 MachineFunction::iterator It = ++MBB->getIterator();
2197 Register DestReg = MI.getOperand(0).getReg();
2198 Register IfTrueReg = MI.getOperand(1).getReg();
2199 Register IfFalseReg = MI.getOperand(2).getReg();
2200 unsigned CondCode = MI.getOperand(3).getImm();
2201 bool NZCVKilled = MI.getOperand(4).isKill();
2203 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2204 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2205 MF->insert(It, TrueBB);
2206 MF->insert(It, EndBB);
2208 // Transfer the rest of the current basic block to EndBB.
2209 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2211 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2213 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2214 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2215 MBB->addSuccessor(TrueBB);
2216 MBB->addSuccessor(EndBB);
2218 // TrueBB falls through to the end.
2219 TrueBB->addSuccessor(EndBB);
2222 TrueBB->addLiveIn(AArch64::NZCV);
2223 EndBB->addLiveIn(AArch64::NZCV);
2226 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2232 MI.eraseFromParent();
2236 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2237 MachineInstr &MI, MachineBasicBlock *BB) const {
2238 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2239 BB->getParent()->getFunction().getPersonalityFn())) &&
2240 "SEH does not use catchret!");
2244 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2245 MachineInstr &MI, MachineBasicBlock *BB) const {
2246 switch (MI.getOpcode()) {
2251 llvm_unreachable("Unexpected instruction for custom inserter!");
2253 case AArch64::F128CSEL:
2254 return EmitF128CSEL(MI, BB);
2256 case TargetOpcode::STACKMAP:
2257 case TargetOpcode::PATCHPOINT:
2258 case TargetOpcode::STATEPOINT:
2259 return emitPatchPoint(MI, BB);
2261 case AArch64::CATCHRET:
2262 return EmitLoweredCatchRet(MI, BB);
2266 //===----------------------------------------------------------------------===//
2267 // AArch64 Lowering private implementation.
2268 //===----------------------------------------------------------------------===//
2270 //===----------------------------------------------------------------------===//
2272 //===----------------------------------------------------------------------===//
2274 // Forward declarations of SVE fixed length lowering helpers
2275 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2276 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2277 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2278 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2281 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2282 static bool isZerosVector(const SDNode *N) {
2283 // Look through a bit convert.
2284 while (N->getOpcode() == ISD::BITCAST)
2285 N = N->getOperand(0).getNode();
2287 if (ISD::isConstantSplatVectorAllZeros(N))
2290 if (N->getOpcode() != AArch64ISD::DUP)
2293 auto Opnd0 = N->getOperand(0);
2294 auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2295 auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2296 return (CINT && CINT->isNullValue()) || (CFP && CFP->isZero());
2299 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC.
2301 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2304 llvm_unreachable("Unknown condition code!");
2306 return AArch64CC::NE;
2308 return AArch64CC::EQ;
2310 return AArch64CC::GT;
2312 return AArch64CC::GE;
2314 return AArch64CC::LT;
2316 return AArch64CC::LE;
2318 return AArch64CC::HI;
2320 return AArch64CC::HS;
2322 return AArch64CC::LO;
2324 return AArch64CC::LS;
2328 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2329 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2330 AArch64CC::CondCode &CondCode,
2331 AArch64CC::CondCode &CondCode2) {
2332 CondCode2 = AArch64CC::AL;
2335 llvm_unreachable("Unknown FP condition!");
2338 CondCode = AArch64CC::EQ;
2342 CondCode = AArch64CC::GT;
2346 CondCode = AArch64CC::GE;
2349 CondCode = AArch64CC::MI;
2352 CondCode = AArch64CC::LS;
2355 CondCode = AArch64CC::MI;
2356 CondCode2 = AArch64CC::GT;
2359 CondCode = AArch64CC::VC;
2362 CondCode = AArch64CC::VS;
2365 CondCode = AArch64CC::EQ;
2366 CondCode2 = AArch64CC::VS;
2369 CondCode = AArch64CC::HI;
2372 CondCode = AArch64CC::PL;
2376 CondCode = AArch64CC::LT;
2380 CondCode = AArch64CC::LE;
2384 CondCode = AArch64CC::NE;
2389 /// Convert a DAG fp condition code to an AArch64 CC.
2390 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2391 /// should be AND'ed instead of OR'ed.
2392 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2393 AArch64CC::CondCode &CondCode,
2394 AArch64CC::CondCode &CondCode2) {
2395 CondCode2 = AArch64CC::AL;
2398 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2399 assert(CondCode2 == AArch64CC::AL);
2403 // == ((a olt b) || (a ogt b))
2404 // == ((a ord b) && (a une b))
2405 CondCode = AArch64CC::VC;
2406 CondCode2 = AArch64CC::NE;
2410 // == ((a uno b) || (a oeq b))
2411 // == ((a ule b) && (a uge b))
2412 CondCode = AArch64CC::PL;
2413 CondCode2 = AArch64CC::LE;
2418 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2419 /// CC usable with the vector instructions. Fewer operations are available
2420 /// without a real NZCV register, so we have to use less efficient combinations
2421 /// to get the same effect.
2422 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2423 AArch64CC::CondCode &CondCode,
2424 AArch64CC::CondCode &CondCode2,
2429 // Mostly the scalar mappings work fine.
2430 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2436 CondCode = AArch64CC::MI;
2437 CondCode2 = AArch64CC::GE;
2444 // All of the compare-mask comparisons are ordered, but we can switch
2445 // between the two by a double inversion. E.g. ULE == !OGT.
2447 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2448 CondCode, CondCode2);
2453 static bool isLegalArithImmed(uint64_t C) {
2454 // Matches AArch64DAGToDAGISel::SelectArithImmed().
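// Legal values are a 12-bit unsigned immediate, optionally shifted left by
// 12 bits; e.g. 0xFFF and 0xFFF000 are legal, 0x1001 is not.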
2455 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2456 LLVM_DEBUG(dbgs() << "Is imm " << C
2457 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2461 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2462 // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
2463 // can be set differently by this operation. It comes down to whether
2464 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2465 // everything is fine. If not then the optimization is wrong. Thus general
2466 // comparisons are only valid if op2 != 0.
2468 // So, finally, the only LLVM-native comparisons that don't mention C and V
2469 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2470 // the absence of information about op2.
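// For example, for an EQ/NE compare, (cmp x, (sub 0, y)) can safely be
// emitted as (cmn x, y), i.e. an ADDS whose result is discarded.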
2471 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2472 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2473 (CC == ISD::SETEQ || CC == ISD::SETNE);
2476 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2477 SelectionDAG &DAG, SDValue Chain,
2479 EVT VT = LHS.getValueType();
2480 assert(VT != MVT::f128);
2481 assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2483 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2484 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2487 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2488 const SDLoc &dl, SelectionDAG &DAG) {
2489 EVT VT = LHS.getValueType();
2490 const bool FullFP16 =
2491 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2493 if (VT.isFloatingPoint()) {
2494 assert(VT != MVT::f128);
2495 if (VT == MVT::f16 && !FullFP16) {
2496 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2497 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2500 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2503 // The CMP instruction is just an alias for SUBS, and representing it as
2504 // SUBS means that it's possible to get CSE with subtract operations.
2505 // A later phase can perform the optimization of setting the destination
2506 // register to WZR/XZR if it ends up being unused.
2507 unsigned Opcode = AArch64ISD::SUBS;
2509 if (isCMN(RHS, CC)) {
2510 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
2511 Opcode = AArch64ISD::ADDS;
2512 RHS = RHS.getOperand(1);
2513 } else if (isCMN(LHS, CC)) {
2514 // As we are looking for EQ/NE compares, the operands can be commuted; can
2515 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
2516 Opcode = AArch64ISD::ADDS;
2517 LHS = LHS.getOperand(1);
2518 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2519 if (LHS.getOpcode() == ISD::AND) {
2520 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2521 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2522 // of the signed comparisons.
2523 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2524 DAG.getVTList(VT, MVT_CC),
2527 // Replace all users of (and X, Y) with newly generated (ands X, Y)
2528 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2529 return ANDSNode.getValue(1);
2530 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2531 // Use result of ANDS
2532 return LHS.getValue(1);
2536 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2540 /// \defgroup AArch64CCMP CMP;CCMP matching
2542 /// These functions deal with the formation of CMP;CCMP;... sequences.
2543 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2544 /// a comparison. They set the NZCV flags to a predefined value if their
2545 /// predicate is false. This allows us to express arbitrary conjunctions, for
2546 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// can be implemented as:
/// cmp A
2549 /// ccmp B, inv(CB), CA
2550 /// check for CB flags
2552 /// This naturally lets us implement chains of AND operations with SETCC
2553 /// operands. And we can even implement some other situations by transforming
/// them:
2555 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
2556 /// negating the flags used in CCMP/FCCMP operations.
2557 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2558 /// by negating the flags we test for afterwards. i.e.
2559 /// NEG (CMP CCMP CCCMP ...) can be implemented.
2560 /// - Note that we can only ever negate all previously processed results.
2561 /// What we can not implement by flipping the flags to test is a negation
2562 /// of two sub-trees (because the negation affects all sub-trees emitted so
2563 /// far, so the 2nd sub-tree we emit would also affect the first).
2564 /// With those tools we can implement some OR operations:
2565 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
2566 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2567 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
2568 /// elimination rules from earlier to implement the whole thing as a
2569 /// CCMP/FCCMP chain.
2571 /// As complete example:
2572 /// or (or (setCA (cmp A)) (setCB (cmp B)))
2573 ///    (and (setCC (cmp C)) (setCD (cmp D)))
2574 /// can be reassociated to:
2575 /// or (and (setCC (cmp C)) (setCD (cmp D)))
2576 ///    (or (setCA (cmp A)) (setCB (cmp B)))
2577 /// can be transformed to:
2578 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2579 ///     (and (not (setCA (cmp A))) (not (setCB (cmp B))))))
2580 /// which can be implemented as:
/// cmp C
2582 /// ccmp D, inv(CD), CC
2583 /// ccmp A, CA, inv(CD)
2584 /// ccmp B, CB, inv(CA)
2585 /// check for CB flags
2587 /// A counterexample is "or (and A B) (and C D)" which translates to
2588 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
2589 /// can only implement one of the inner (not) operations, but not both!
2592 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2593 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2594 ISD::CondCode CC, SDValue CCOp,
2595 AArch64CC::CondCode Predicate,
2596 AArch64CC::CondCode OutCC,
2597 const SDLoc &DL, SelectionDAG &DAG) {
2598 unsigned Opcode = 0;
2599 const bool FullFP16 =
2600 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2602 if (LHS.getValueType().isFloatingPoint()) {
2603 assert(LHS.getValueType() != MVT::f128);
2604 if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2605 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2606 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2608 Opcode = AArch64ISD::FCCMP;
2609 } else if (RHS.getOpcode() == ISD::SUB) {
2610 SDValue SubOp0 = RHS.getOperand(0);
2611 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2612 // See emitComparison() on why we can only do this for SETEQ and SETNE.
2613 Opcode = AArch64ISD::CCMN;
2614 RHS = RHS.getOperand(1);
2618 Opcode = AArch64ISD::CCMP;
2620 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
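// If the predicate fails, the CCMP writes this NZCV immediate directly; we
// choose flag values that satisfy the inverse of OutCC so that the overall
// conjunction then tests false.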
2621 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
2622 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2623 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2624 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2627 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2628 /// expressed as a conjunction. See \ref AArch64CCMP.
2629 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
2630 /// changing the conditions on the SETCC tests.
2631 /// (this means we can call emitConjunctionRec() with
2632 /// Negate==true on this sub-tree)
2633 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
2634 /// cannot do the negation naturally. We are required to
2635 /// emit the subtree first in this case.
2636 /// \param WillNegate Is true if we are called when the result of this
2637 /// subexpression must be negated. This happens when the
2638 /// outer expression is an OR. We can use this fact to know
2639 /// that we have a double negation (or (or ...) ...) that
2640 /// can be implemented for free.
2641 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2642 bool &MustBeFirst, bool WillNegate,
2643 unsigned Depth = 0) {
2644 if (!Val.hasOneUse())
2646 unsigned Opcode = Val->getOpcode();
2647 if (Opcode == ISD::SETCC) {
2648 if (Val->getOperand(0).getValueType() == MVT::f128)
2651 MustBeFirst = false;
2654 // Protect against exponential runtime and stack overflow.
2657 if (Opcode == ISD::AND || Opcode == ISD::OR) {
2658 bool IsOR = Opcode == ISD::OR;
2659 SDValue O0 = Val->getOperand(0);
2660 SDValue O1 = Val->getOperand(1);
2663 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2667 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2670 if (MustBeFirstL && MustBeFirstR)
2674 // For an OR expression we need to be able to naturally negate at least
2675 // one side or we cannot do the transformation at all.
2676 if (!CanNegateL && !CanNegateR)
2678 // If the result of the OR will be negated and we can naturally negate
2679 // the leaves, then this sub-tree as a whole negates naturally.
2680 CanNegate = WillNegate && CanNegateL && CanNegateR;
2681 // If we cannot naturally negate the whole sub-tree, then this must be
// emitted first.
2683 MustBeFirst = !CanNegate;
2685 assert(Opcode == ISD::AND && "Must be OR or AND");
2686 // We cannot naturally negate an AND operation.
2688 MustBeFirst = MustBeFirstL || MustBeFirstR;
2695 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2696 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
2697 /// Tries to transform the given i1 producing node @p Val into a series of
2698 /// compare and conditional compare operations. @returns an NZCV flags producing node
2699 /// and sets @p OutCC to the flags that should be tested, or returns SDValue() if
2700 /// the transformation was not possible.
2701 /// \p Negate is true if we want this sub-tree to be negated just by changing
2702 /// SETCC conditions.
2703 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
2704 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2705 AArch64CC::CondCode Predicate) {
2706 // We're at a tree leaf, produce a conditional comparison operation.
2707 unsigned Opcode = Val->getOpcode();
2708 if (Opcode == ISD::SETCC) {
2709 SDValue LHS = Val->getOperand(0);
2710 SDValue RHS = Val->getOperand(1);
2711 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2712 bool isInteger = LHS.getValueType().isInteger();
2714 CC = getSetCCInverse(CC, LHS.getValueType());
2716 // Determine OutCC and handle FP special case.
2718 OutCC = changeIntCCToAArch64CC(CC);
2720 assert(LHS.getValueType().isFloatingPoint());
2721 AArch64CC::CondCode ExtraCC;
2722 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
2723 // Some floating point conditions can't be tested with a single condition
2724 // code. Construct an additional comparison in this case.
2725 if (ExtraCC != AArch64CC::AL) {
2727 if (!CCOp.getNode())
2728 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2730 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2733 Predicate = ExtraCC;
2737 // Produce a normal comparison if we are first in the chain
2739 return emitComparison(LHS, RHS, CC, DL, DAG);
2740 // Otherwise produce a ccmp.
2741 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2744 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2746 bool IsOR = Opcode == ISD::OR;
2748 SDValue LHS = Val->getOperand(0);
2751 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
2752 assert(ValidL && "Valid conjunction/disjunction tree");
2755 SDValue RHS = Val->getOperand(1);
2758 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
2759 assert(ValidR && "Valid conjunction/disjunction tree");
2762 // Swap sub-tree that must come first to the right side.
2764 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
2765 std::swap(LHS, RHS);
2766 std::swap(CanNegateL, CanNegateR);
2767 std::swap(MustBeFirstL, MustBeFirstR);
2773 bool NegateAfterAll;
2774 if (Opcode == ISD::OR) {
2775 // Swap the sub-tree that we can negate naturally to the left.
2777 assert(CanNegateR && "at least one side must be negatable");
2778 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
2780 std::swap(LHS, RHS);
2782 NegateAfterR = true;
2784 // Negate the left sub-tree if possible, otherwise negate the result.
2785 NegateR = CanNegateR;
2786 NegateAfterR = !CanNegateR;
2789 NegateAfterAll = !Negate;
2791 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
2792 assert(!Negate && "Valid conjunction/disjunction tree");
2796 NegateAfterR = false;
2797 NegateAfterAll = false;
2801 AArch64CC::CondCode RHSCC;
2802 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
2804 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
2805 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
2807 OutCC = AArch64CC::getInvertedCondCode(OutCC);
2811 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
2812 /// In some cases this is even possible with OR operations in the expression.
2813 /// See \ref AArch64CCMP.
2814 /// \see emitConjunctionRec().
2815 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
2816 AArch64CC::CondCode &OutCC) {
2817 bool DummyCanNegate;
2818 bool DummyMustBeFirst;
2819 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
2822 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2827 /// Returns how profitable it is to fold a comparison's operand's shift and/or
2828 /// extension operations.
2829 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
2830 auto isSupportedExtend = [&](SDValue V) {
2831 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2834 if (V.getOpcode() == ISD::AND)
2835 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2836 uint64_t Mask = MaskCst->getZExtValue();
2837 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2843 if (!Op.hasOneUse())
2846 if (isSupportedExtend(Op))
2849 unsigned Opc = Op.getOpcode();
2850 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2851 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2852 uint64_t Shift = ShiftCst->getZExtValue();
2853 if (isSupportedExtend(Op.getOperand(0)))
2854 return (Shift <= 4) ? 2 : 1;
2855 EVT VT = Op.getValueType();
2856 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2863 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2864 SDValue &AArch64cc, SelectionDAG &DAG,
2866 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2867 EVT VT = RHS.getValueType();
2868 uint64_t C = RHSC->getZExtValue();
2869 if (!isLegalArithImmed(C)) {
2870 // Constant does not fit, try adjusting it by one?
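// Each rewrite below keeps the comparison equivalent while nudging the
// constant into the encodable range:
//   SETLT/SETGE:   x <  C  <=>  x <= C-1,   x >= C  <=>  x >  C-1
//   SETULT/SETUGE: x <u C  <=>  x <=u C-1,  x >=u C <=>  x >u C-1   (C != 0)
//   SETLE/SETGT:   x <= C  <=>  x <  C+1,   x >  C  <=>  x >= C+1
//   SETULE/SETUGT: x <=u C <=>  x <u C+1,   x >u C  <=>  x >=u C+1  (C != ~0)
// The guards on C avoid wrapping at the signed/unsigned extremes.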
2876 if ((VT == MVT::i32 && C != 0x80000000 &&
2877 isLegalArithImmed((uint32_t)(C - 1))) ||
2878 (VT == MVT::i64 && C != 0x80000000ULL &&
2879 isLegalArithImmed(C - 1ULL))) {
2880 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2881 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2882 RHS = DAG.getConstant(C, dl, VT);
2887 if ((VT == MVT::i32 && C != 0 &&
2888 isLegalArithImmed((uint32_t)(C - 1))) ||
2889 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2890 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2891 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2892 RHS = DAG.getConstant(C, dl, VT);
2897 if ((VT == MVT::i32 && C != INT32_MAX &&
2898 isLegalArithImmed((uint32_t)(C + 1))) ||
2899 (VT == MVT::i64 && C != INT64_MAX &&
2900 isLegalArithImmed(C + 1ULL))) {
2901 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2902 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2903 RHS = DAG.getConstant(C, dl, VT);
2908 if ((VT == MVT::i32 && C != UINT32_MAX &&
2909 isLegalArithImmed((uint32_t)(C + 1))) ||
2910 (VT == MVT::i64 && C != UINT64_MAX &&
2911 isLegalArithImmed(C + 1ULL))) {
2912 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2913 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2914 RHS = DAG.getConstant(C, dl, VT);
2921 // Comparisons are canonicalized so that the RHS operand is simpler than the
2922 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2923 // can fold some shift+extend operations on the RHS operand, so swap the
2924 // operands if that can be done.
//
// For example:
//   lsl w13, w11, #1
//   cmp w13, w12
2929 // can be turned into:
2930 // cmp w12, w11, lsl #1
2931 if (!isa<ConstantSDNode>(RHS) ||
2932 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2933 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2935 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
2936 std::swap(LHS, RHS);
2937 CC = ISD::getSetCCSwappedOperands(CC);
2942 AArch64CC::CondCode AArch64CC;
2943 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2944 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2946 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2947 // For the i8 operand, the largest immediate is 255, so this can be easily
2948 // encoded in the compare instruction. For the i16 operand, however, the
2949 // largest immediate cannot be encoded in the compare.
2950 // Therefore, use a sign extending load and cmn to avoid materializing the
2951 // -1 constant. For example,
// mov w1, #65535
2953 // ldrh w0, [x0, #0]
// cmp w0, w1
// becomes
2956 // ldrsh w0, [x0, #0]
// cmn w0, #1
2958 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2959 // if and only if (sext LHS) == (sext RHS). The checks are in place to
2960 // ensure both the LHS and RHS are truly zero extended and to make sure the
2961 // transformation is profitable.
2962 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2963 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2964 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2965 LHS.getNode()->hasNUsesOfValue(1, 0)) {
2966 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2967 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
2969 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2970 DAG.getValueType(MVT::i16));
2971 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2972 RHS.getValueType()),
2974 AArch64CC = changeIntCCToAArch64CC(CC);
2978 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2979 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2980 if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2981 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2987 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2988 AArch64CC = changeIntCCToAArch64CC(CC);
2990 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2994 static std::pair<SDValue, SDValue>
2995 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2996 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2997 "Unsupported value type");
2998 SDValue Value, Overflow;
3000 SDValue LHS = Op.getOperand(0);
3001 SDValue RHS = Op.getOperand(1);
3003 switch (Op.getOpcode()) {
3005 llvm_unreachable("Unknown overflow instruction!");
3007 Opc = AArch64ISD::ADDS;
3011 Opc = AArch64ISD::ADDS;
3015 Opc = AArch64ISD::SUBS;
3019 Opc = AArch64ISD::SUBS;
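// ADDS/SUBS set NZCV, so signed overflow can later be tested with the VS
// condition and unsigned carry/borrow with HS/LO.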
3022 // Multiply needs a little extra work.
3026 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3027 if (Op.getValueType() == MVT::i32) {
3028 // Extend to 64-bits, then perform a 64-bit multiply.
3029 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3030 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3031 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3032 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3033 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3035 // Check that the result fits into a 32-bit integer.
3036 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3038 // cmp xreg, wreg, sxtw
3039 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3041 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3043 // tst xreg, #0xffffffff00000000
3044 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3046 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
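// For example, the unsigned 32-bit multiply 0x80000000 * 2 produces the
// 64-bit value 0x100000000; its upper 32 bits are nonzero, so the ANDS
// above clears the Z flag and overflow is reported.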
3050 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3051 // For the 64-bit multiply, detect overflow from the high half of the product.
3052 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3054 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3055 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3056 DAG.getConstant(63, DL, MVT::i64));
3057 // It is important that LowerBits is last, otherwise the arithmetic
3058 // shift will not be folded into the compare (SUBS).
3059 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3060 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3063 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3064 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3066 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3067 DAG.getConstant(0, DL, MVT::i64),
3068 UpperBits).getValue(1);
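// For the unsigned 64-bit multiply, overflow occurred iff the high 64 bits
// of the 128-bit product (MULHU) are nonzero, so comparing them against
// zero leaves the Z flag clear exactly on overflow.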
3075 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3077 // Emit the AArch64 operation with overflow check.
3078 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3079 Overflow = Value.getValue(1);
3081 return std::make_pair(Value, Overflow);
3084 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3085 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3086 return LowerToScalableOp(Op, DAG);
3088 SDValue Sel = Op.getOperand(0);
3089 SDValue Other = Op.getOperand(1);
3092 // If the operand is an overflow checking operation, invert the condition
3093 // code and kill the Not operation. I.e., transform:
3094 // (xor (overflow_op_bool, 1))
3096 // (csel 1, 0, invert(cc), overflow_op_bool)
3097 // ... which later gets transformed to just a cset instruction with an
3098 // inverted condition code, rather than a cset + eor sequence.
3099 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3100 // Only lower legal XALUO ops.
3101 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3104 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3105 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3106 AArch64CC::CondCode CC;
3107 SDValue Value, Overflow;
3108 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3109 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3110 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3113 // If neither operand is a SELECT_CC, give up.
3114 if (Sel.getOpcode() != ISD::SELECT_CC)
3115 std::swap(Sel, Other);
3116 if (Sel.getOpcode() != ISD::SELECT_CC)
3119 // The folding we want to perform is:
3120 // (xor x, (select_cc a, b, cc, 0, -1) )
3122 // (csel x, (xor x, -1), cc ...)
3124 // The latter will get matched to a CSINV instruction.
3126 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3127 SDValue LHS = Sel.getOperand(0);
3128 SDValue RHS = Sel.getOperand(1);
3129 SDValue TVal = Sel.getOperand(2);
3130 SDValue FVal = Sel.getOperand(3);
3132 // FIXME: This could be generalized to non-integer comparisons.
3133 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3136 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3137 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3139 // The values aren't constants, this isn't the pattern we're looking for.
3140 if (!CFVal || !CTVal)
3143 // We can commute the SELECT_CC by inverting the condition. This
3144 // might be needed to make this fit into a CSINV pattern.
3145 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
3146 std::swap(TVal, FVal);
3147 std::swap(CTVal, CFVal);
3148 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3151 // If the constants line up, perform the transform!
3152 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
3154 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3157 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3158 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3160 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3167 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
3168 EVT VT = Op.getValueType();
3170 // Let legalize expand this if it isn't a legal type yet.
3171 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3174 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
3177 bool ExtraOp = false;
3178 switch (Op.getOpcode()) {
3180 llvm_unreachable("Invalid code");
3182 Opc = AArch64ISD::ADDS;
3185 Opc = AArch64ISD::SUBS;
3188 Opc = AArch64ISD::ADCS;
3192 Opc = AArch64ISD::SBCS;
3198 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
3199 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
3203 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3204 // Let legalize expand this if it isn't a legal type yet.
3205 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3209 AArch64CC::CondCode CC;
3210 // The actual operation that sets the overflow or carry flag.
3211 SDValue Value, Overflow;
3212 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3214 // We use 0 and 1 as false and true values.
3215 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3216 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3218 // We use an inverted condition, because the conditional select is inverted
3219 // too. This will allow it to be selected to a single instruction:
3220 // CSINC Wd, WZR, WZR, invert(cond).
3221 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3222 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3225 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3226 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3229 // Prefetch operands are:
3230 // 1: Address to prefetch
3231 // 2: bool isWrite
3232 // 3: int locality (0 = no locality ... 3 = extreme locality)
3233 // 4: bool isDataCache
3234 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3236 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3237 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3238 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3240 bool IsStream = !Locality;
3241 // When the locality number is set
3242 if (Locality) {
3243 // The front-end should have filtered out the out-of-range values
3244 assert(Locality <= 3 && "Prefetch locality out-of-range");
3245 // The locality degree is the opposite of the cache speed.
3246 // Put the number the other way around.
3247 // The encoding starts at 0 for level 1
3248 Locality = 3 - Locality;
3249 }
3251 // Build the mask value encoding the expected behavior.
3252 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3253 (!IsData << 3) | // IsDataCache bit
3254 (Locality << 1) | // Cache level bits
3255 (unsigned)IsStream; // Stream bit
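// For example, a read prefetch of data with maximal locality (IsWrite = 0,
// Locality = 3, IsData = 1) becomes IsStream = 0 and Locality = 0, so
// PrfOp = 0b00000, i.e. PLDL1KEEP.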
3256 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3257 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
3260 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3261 SelectionDAG &DAG) const {
3262 EVT VT = Op.getValueType();
3263 if (VT.isScalableVector())
3264 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3266 if (useSVEForFixedLengthVectorVT(VT))
3267 return LowerFixedLengthFPExtendToSVE(Op, DAG);
3269 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3273 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3274 SelectionDAG &DAG) const {
3275 if (Op.getValueType().isScalableVector())
3276 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3278 bool IsStrict = Op->isStrictFPOpcode();
3279 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3280 EVT SrcVT = SrcVal.getValueType();
3282 if (useSVEForFixedLengthVectorVT(SrcVT))
3283 return LowerFixedLengthFPRoundToSVE(Op, DAG);
3285 if (SrcVT != MVT::f128) {
3286 // Expand cases where the input is a vector bigger than NEON.
3287 if (useSVEForFixedLengthVectorVT(SrcVT))
3290 // It's legal except when f128 is involved
3297 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3298 SelectionDAG &DAG) const {
3299 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3300 // Any additional optimization in this function should be recorded
3301 // in the cost tables.
3302 EVT InVT = Op.getOperand(0).getValueType();
3303 EVT VT = Op.getValueType();
3305 if (VT.isScalableVector()) {
3306 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3307 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3308 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3309 return LowerToPredicatedOp(Op, DAG, Opcode);
3312 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3313 return LowerFixedLengthFPToIntToSVE(Op, DAG);
3315 unsigned NumElts = InVT.getVectorNumElements();
3317 // f16 conversions are promoted to f32 when full fp16 is not supported.
3318 if (InVT.getVectorElementType() == MVT::f16 &&
3319 !Subtarget->hasFullFP16()) {
3320 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3323 Op.getOpcode(), dl, Op.getValueType(),
3324 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3327 uint64_t VTSize = VT.getFixedSizeInBits();
3328 uint64_t InVTSize = InVT.getFixedSizeInBits();
3329 if (VTSize < InVTSize) {
3332 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3334 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3337 if (VTSize > InVTSize) {
3340 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3341 VT.getVectorNumElements());
3342 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3343 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
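// For example, v2f64 -> v2i32 converts to v2i64 and truncates the result,
// while v4f16 -> v4i32 first extends the source to v4f32 and then converts.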
3346 // Type changing conversions are illegal.
3350 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3351 SelectionDAG &DAG) const {
3352 bool IsStrict = Op->isStrictFPOpcode();
3353 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3355 if (SrcVal.getValueType().isVector())
3356 return LowerVectorFP_TO_INT(Op, DAG);
3358 // f16 conversions are promoted to f32 when full fp16 is not supported.
3359 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3360 assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3363 Op.getOpcode(), dl, Op.getValueType(),
3364 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3367 if (SrcVal.getValueType() != MVT::f128) {
3368 // It's legal except when f128 is involved
3375 SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
3376 SelectionDAG &DAG) const {
3377 // AArch64 FP-to-int conversions saturate to the destination register size, so
3378 // we can lower common saturating conversions to simple instructions.
3379 SDValue SrcVal = Op.getOperand(0);
3381 EVT SrcVT = SrcVal.getValueType();
3382 EVT DstVT = Op.getValueType();
3384 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3385 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3386 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3387 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3389 // TODO: Support lowering of NEON and SVE conversions.
3390 if (SrcVT.isVector())
3393 // TODO: Saturate to SatWidth explicitly.
3394 if (SatWidth != DstWidth)
3397 // In the absence of FP16 support, promote f16 to f32, like LowerFP_TO_INT().
3398 if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16())
3399 return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
3400 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal),
3403 // Cases that we can emit directly.
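// For example, a saturating f32 -> i32 conversion is just FCVTZS/FCVTZU:
// the scalar conversion instructions already clamp out-of-range inputs to
// the destination's minimum/maximum value.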
3404 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
3405 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
3406 (DstVT == MVT::i64 || DstVT == MVT::i32))
3409 // For all other cases, fall back on the expanded form.
3413 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
3414 SelectionDAG &DAG) const {
3415 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3416 // Any additional optimization in this function should be recorded
3417 // in the cost tables.
3418 EVT VT = Op.getValueType();
3420 SDValue In = Op.getOperand(0);
3421 EVT InVT = In.getValueType();
3422 unsigned Opc = Op.getOpcode();
3423 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
3425 if (VT.isScalableVector()) {
3426 if (InVT.getVectorElementType() == MVT::i1) {
3427 // An SVE predicate (i1 elements) can't be converted directly; extend it to an integer vector first.
3428 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3429 EVT CastVT = getPromotedVTForPredicate(InVT);
3430 In = DAG.getNode(CastOpc, dl, CastVT, In);
3431 return DAG.getNode(Opc, dl, VT, In);
3434 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
3435 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
3436 return LowerToPredicatedOp(Op, DAG, Opcode);
3439 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3440 return LowerFixedLengthIntToFPToSVE(Op, DAG);
3442 uint64_t VTSize = VT.getFixedSizeInBits();
3443 uint64_t InVTSize = InVT.getFixedSizeInBits();
3444 if (VTSize < InVTSize) {
3446 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
3447 InVT.getVectorNumElements());
3448 In = DAG.getNode(Opc, dl, CastVT, In);
3449 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
3452 if (VTSize > InVTSize) {
3453 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3454 EVT CastVT = VT.changeVectorElementTypeToInteger();
3455 In = DAG.getNode(CastOpc, dl, CastVT, In);
3456 return DAG.getNode(Opc, dl, VT, In);
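// For example, v2i32 -> v2f64 extends the source to v2i64 and then converts,
// while v2i64 -> v2f32 converts to v2f64 and rounds the result down to f32.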
3462 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
3463 SelectionDAG &DAG) const {
3464 if (Op.getValueType().isVector())
3465 return LowerVectorINT_TO_FP(Op, DAG);
3467 bool IsStrict = Op->isStrictFPOpcode();
3468 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3470 // f16 conversions are promoted to f32 when full fp16 is not supported.
3471 if (Op.getValueType() == MVT::f16 &&
3472 !Subtarget->hasFullFP16()) {
3473 assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3476 ISD::FP_ROUND, dl, MVT::f16,
3477 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
3478 DAG.getIntPtrConstant(0, dl));
3481 // i128 conversions are libcalls.
3482 if (SrcVal.getValueType() == MVT::i128)
3485 // Other conversions are legal, unless it's to the completely software-based
3486 // fp128.
3487 if (Op.getValueType() != MVT::f128)
3492 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
3493 SelectionDAG &DAG) const {
3494 // For iOS, we want to call an alternative entry point: __sincos_stret,
3495 // which returns the values in two S / D registers.
3497 SDValue Arg = Op.getOperand(0);
3498 EVT ArgVT = Arg.getValueType();
3499 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
3506 Entry.IsSExt = false;
3507 Entry.IsZExt = false;
3508 Args.push_back(Entry);
3510 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
3511 : RTLIB::SINCOS_STRET_F32;
3512 const char *LibcallName = getLibcallName(LC);
3514 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
3516 StructType *RetTy = StructType::get(ArgTy, ArgTy);
3517 TargetLowering::CallLoweringInfo CLI(DAG);
3519 .setChain(DAG.getEntryNode())
3520 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
3522 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3523 return CallResult.first;
3526 static MVT getSVEContainerType(EVT ContentTy);
3528 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
3529 SelectionDAG &DAG) const {
3530 EVT OpVT = Op.getValueType();
3531 EVT ArgVT = Op.getOperand(0).getValueType();
3533 if (useSVEForFixedLengthVectorVT(OpVT))
3534 return LowerFixedLengthBitcastToSVE(Op, DAG);
3536 if (OpVT.isScalableVector()) {
3537 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
3538 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
3539 "Expected int->fp bitcast!");
3541 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
3543 return getSVESafeBitCast(OpVT, ExtResult, DAG);
3545 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
3548 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
3551 assert(ArgVT == MVT::i16);
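// An i16 -> f16/bf16 bitcast has no single instruction; the code below
// any-extends to i32, bitcasts to f32, and then extracts the low 16-bit
// "h" subregister.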
3554 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
3555 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
3557 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
3558 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
3562 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
3563 if (OrigVT.getSizeInBits() >= 64)
3566 assert(OrigVT.isSimple() && "Expecting a simple value type");
3568 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
3569 switch (OrigSimpleTy) {
3570 default: llvm_unreachable("Unexpected Vector Type");
3579 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
3582 unsigned ExtOpcode) {
3583 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
3584 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
3585 // 64-bits we need to insert a new extension so that it will be 64-bits.
3586 assert(ExtTy.is128BitVector() && "Unexpected extension size");
3587 if (OrigTy.getSizeInBits() >= 64)
3590 // Must extend size to at least 64 bits to be used as an operand for VMULL.
3591 EVT NewVT = getExtensionTo64Bits(OrigTy);
3593 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
3596 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
3598 EVT VT = N->getValueType(0);
3600 if (N->getOpcode() != ISD::BUILD_VECTOR)
3603 for (const SDValue &Elt : N->op_values()) {
3604 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
3605 unsigned EltSize = VT.getScalarSizeInBits();
3606 unsigned HalfSize = EltSize / 2;
3608 if (!isIntN(HalfSize, C->getSExtValue()))
3611 if (!isUIntN(HalfSize, C->getZExtValue()))
3622 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
3623 if (N->getOpcode() == ISD::SIGN_EXTEND ||
3624 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
3625 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
3626 N->getOperand(0)->getValueType(0),
3630 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
3631 EVT VT = N->getValueType(0);
3633 unsigned EltSize = VT.getScalarSizeInBits() / 2;
3634 unsigned NumElts = VT.getVectorNumElements();
3635 MVT TruncVT = MVT::getIntegerVT(EltSize);
3636 SmallVector<SDValue, 8> Ops;
3637 for (unsigned i = 0; i != NumElts; ++i) {
3638 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
3639 const APInt &CInt = C->getAPIntValue();
3640 // Element types smaller than 32 bits are not legal, so use i32 elements.
3641 // The values are implicitly truncated so sext vs. zext doesn't matter.
3642 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
3644 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
3647 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
3648 return N->getOpcode() == ISD::SIGN_EXTEND ||
3649 N->getOpcode() == ISD::ANY_EXTEND ||
3650 isExtendedBUILD_VECTOR(N, DAG, true);
3653 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
3654 return N->getOpcode() == ISD::ZERO_EXTEND ||
3655 N->getOpcode() == ISD::ANY_EXTEND ||
3656 isExtendedBUILD_VECTOR(N, DAG, false);
3659 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
3660 unsigned Opcode = N->getOpcode();
3661 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3662 SDNode *N0 = N->getOperand(0).getNode();
3663 SDNode *N1 = N->getOperand(1).getNode();
3664 return N0->hasOneUse() && N1->hasOneUse() &&
3665 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
3670 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
3671 unsigned Opcode = N->getOpcode();
3672 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3673 SDNode *N0 = N->getOperand(0).getNode();
3674 SDNode *N1 = N->getOperand(1).getNode();
3675 return N0->hasOneUse() && N1->hasOneUse() &&
3676 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
3681 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3682 SelectionDAG &DAG) const {
3683 // The rounding mode is in bits 23:22 of the FPCR.
3684 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
3685 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3
3686 // so that the shift + and get folded into a bitfield extract.
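// For example, FPCR.RMode == 0b00 (round to nearest) yields
// ((0 + (1 << 22)) >> 22) & 3 == 1, and 0b11 (round toward zero) yields
// ((0xC00000 + 0x400000) >> 22) & 3 == 0, matching the FLT_ROUNDS values.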
3689 SDValue Chain = Op.getOperand(0);
3690 SDValue FPCR_64 = DAG.getNode(
3691 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
3692 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
3693 Chain = FPCR_64.getValue(1);
3694 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
3695 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
3696 DAG.getConstant(1U << 22, dl, MVT::i32));
3697 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
3698 DAG.getConstant(22, dl, MVT::i32));
3699 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
3700 DAG.getConstant(3, dl, MVT::i32));
3701 return DAG.getMergeValues({AND, Chain}, dl);
3704 SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
3705 SelectionDAG &DAG) const {
3707 SDValue Chain = Op->getOperand(0);
3708 SDValue RMValue = Op->getOperand(1);
3710 // The rounding mode is in bits 23:22 of the FPCR.
3711 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
3712 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
3713 // ((arg - 1) & 3) << 22).
3715 // The argument of llvm.set.rounding must be within the range [0, 3], so
3716 // NearestTiesToAway (4) is not handled here. It is the responsibility of
3717 // the code that generated llvm.set.rounding to ensure this condition.
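// For example, llvm.set.rounding(1) (to nearest) computes ((1 - 1) & 3) << 22
// == 0, i.e. FPCR.RMode = 0b00, while llvm.set.rounding(0) (toward zero)
// computes ((0 - 1) & 3) << 22, i.e. FPCR.RMode = 0b11.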
3719 // Calculate new value of FPCR[23:22].
3720 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
3721 DAG.getConstant(1, DL, MVT::i32));
3722 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
3723 DAG.getConstant(0x3, DL, MVT::i32));
3725 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
3726 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
3727 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
3729 // Get current value of FPCR.
3731 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
3733 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
3734 Chain = FPCR.getValue(1);
3735 FPCR = FPCR.getValue(0);
3737 // Put the new rounding mode into FPCR[23:22].
3738 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
3739 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
3740 DAG.getConstant(RMMask, DL, MVT::i64));
3741 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
3743 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
3745 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
3748 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
3749 EVT VT = Op.getValueType();
3751 // If SVE is available then i64 vector multiplications can also be made legal.
3752 bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
3754 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
3755 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
3757 // Multiplications are only custom-lowered for 128-bit vectors so that
3758 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
3759 assert(VT.is128BitVector() && VT.isInteger() &&
3760 "unexpected type for custom-lowering ISD::MUL");
3761 SDNode *N0 = Op.getOperand(0).getNode();
3762 SDNode *N1 = Op.getOperand(1).getNode();
3763 unsigned NewOpc = 0;
3765 bool isN0SExt = isSignExtended(N0, DAG);
3766 bool isN1SExt = isSignExtended(N1, DAG);
3767 if (isN0SExt && isN1SExt)
3768 NewOpc = AArch64ISD::SMULL;
3770 bool isN0ZExt = isZeroExtended(N0, DAG);
3771 bool isN1ZExt = isZeroExtended(N1, DAG);
3772 if (isN0ZExt && isN1ZExt)
3773 NewOpc = AArch64ISD::UMULL;
3774 else if (isN1SExt || isN1ZExt) {
3775 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
3776 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
3777 if (isN1SExt && isAddSubSExt(N0, DAG)) {
3778 NewOpc = AArch64ISD::SMULL;
3780 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
3781 NewOpc = AArch64ISD::UMULL;
3783 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
3785 NewOpc = AArch64ISD::UMULL;
3791 if (VT == MVT::v2i64)
3792 // Fall through to expand this. It is not legal.
3795 // Other vector multiplications are legal.
3800 // Legalize to a S/UMULL instruction
3803 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
3805 Op0 = skipExtensionForVectorMULL(N0, DAG);
3806 assert(Op0.getValueType().is64BitVector() &&
3807 Op1.getValueType().is64BitVector() &&
3808 "unexpected types for extended operands to VMULL");
3809 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
3811 // Optimizing (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
3812 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
3813 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
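// For example, v8i16 (mul (add (zext v8i8 A), (zext v8i8 B)), (zext v8i8 C))
// becomes (add (umull A, C), (umull B, C)).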
3814 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
3815 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
3816 EVT Op1VT = Op1.getValueType();
3817 return DAG.getNode(N0->getOpcode(), DL, VT,
3818 DAG.getNode(NewOpc, DL, VT,
3819 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
3820 DAG.getNode(NewOpc, DL, VT,
3821 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
3824 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
3826 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
3827 DAG.getTargetConstant(Pattern, DL, MVT::i32));
3830 static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) {
3832 EVT OutVT = Op.getValueType();
3833 SDValue InOp = Op.getOperand(1);
3834 EVT InVT = InOp.getValueType();
3836 // Return the operand if the cast isn't changing type,
3837 // i.e. <n x 16 x i1> -> <n x 16 x i1>
3838 if (InVT == OutVT)
3839 return InOp;
3841 SDValue Reinterpret =
3842 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp);
3844 // If the argument converted to an svbool is a ptrue or a comparison, the
3845 // lanes introduced by the widening are zero by construction.
3846 switch (InOp.getOpcode()) {
3847 case AArch64ISD::SETCC_MERGE_ZERO:
3849 case ISD::INTRINSIC_WO_CHAIN:
3850 if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue)
3854 // Otherwise, zero the newly introduced lanes.
3855 SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all);
3856 SDValue MaskReinterpret =
3857 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask);
3858 return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret);
3861 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3862 SelectionDAG &DAG) const {
3863 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3866 default: return SDValue(); // Don't custom lower most intrinsics.
3867 case Intrinsic::thread_pointer: {
3868 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3869 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
3871 case Intrinsic::aarch64_neon_abs: {
3872 EVT Ty = Op.getValueType();
3873 if (Ty == MVT::i64) {
3874 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
3876 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
3877 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
3878 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
3879 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
3881 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
3884 case Intrinsic::aarch64_neon_smax:
3885 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
3886 Op.getOperand(1), Op.getOperand(2));
3887 case Intrinsic::aarch64_neon_umax:
3888 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
3889 Op.getOperand(1), Op.getOperand(2));
3890 case Intrinsic::aarch64_neon_smin:
3891 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
3892 Op.getOperand(1), Op.getOperand(2));
3893 case Intrinsic::aarch64_neon_umin:
3894 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
3895 Op.getOperand(1), Op.getOperand(2));
3897 case Intrinsic::aarch64_sve_sunpkhi:
3898 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
3900 case Intrinsic::aarch64_sve_sunpklo:
3901 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
3903 case Intrinsic::aarch64_sve_uunpkhi:
3904 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
3906 case Intrinsic::aarch64_sve_uunpklo:
3907 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
3909 case Intrinsic::aarch64_sve_clasta_n:
3910 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
3911 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3912 case Intrinsic::aarch64_sve_clastb_n:
3913 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
3914 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3915 case Intrinsic::aarch64_sve_lasta:
3916 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
3917 Op.getOperand(1), Op.getOperand(2));
3918 case Intrinsic::aarch64_sve_lastb:
3919 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
3920 Op.getOperand(1), Op.getOperand(2));
3921 case Intrinsic::aarch64_sve_rev:
3922 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
3924 case Intrinsic::aarch64_sve_tbl:
3925 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
3926 Op.getOperand(1), Op.getOperand(2));
3927 case Intrinsic::aarch64_sve_trn1:
3928 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
3929 Op.getOperand(1), Op.getOperand(2));
3930 case Intrinsic::aarch64_sve_trn2:
3931 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
3932 Op.getOperand(1), Op.getOperand(2));
3933 case Intrinsic::aarch64_sve_uzp1:
3934 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
3935 Op.getOperand(1), Op.getOperand(2));
3936 case Intrinsic::aarch64_sve_uzp2:
3937 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
3938 Op.getOperand(1), Op.getOperand(2));
3939 case Intrinsic::aarch64_sve_zip1:
3940 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
3941 Op.getOperand(1), Op.getOperand(2));
3942 case Intrinsic::aarch64_sve_zip2:
3943 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
3944 Op.getOperand(1), Op.getOperand(2));
3945 case Intrinsic::aarch64_sve_splice:
3946 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
3947 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3948 case Intrinsic::aarch64_sve_ptrue:
3949 return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
3951 case Intrinsic::aarch64_sve_clz:
3952 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
3953 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3954 case Intrinsic::aarch64_sve_cnt: {
3955 SDValue Data = Op.getOperand(3);
3956 // CTPOP only supports integer operands.
3957 if (Data.getValueType().isFloatingPoint())
3958 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
3959 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
3960 Op.getOperand(2), Data, Op.getOperand(1));
3962 case Intrinsic::aarch64_sve_dupq_lane:
3963 return LowerDUPQLane(Op, DAG);
3964 case Intrinsic::aarch64_sve_convert_from_svbool:
3965 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
3967 case Intrinsic::aarch64_sve_convert_to_svbool:
3968 return lowerConvertToSVBool(Op, DAG);
3969 case Intrinsic::aarch64_sve_fneg:
3970 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
3971 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3972 case Intrinsic::aarch64_sve_frintp:
3973 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
3974 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3975 case Intrinsic::aarch64_sve_frintm:
3976 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
3977 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3978 case Intrinsic::aarch64_sve_frinti:
3979 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3980 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3981 case Intrinsic::aarch64_sve_frintx:
3982 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3983 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3984 case Intrinsic::aarch64_sve_frinta:
3985 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
3986 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3987 case Intrinsic::aarch64_sve_frintn:
3988 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
3989 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3990 case Intrinsic::aarch64_sve_frintz:
3991 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
3992 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3993 case Intrinsic::aarch64_sve_ucvtf:
3994 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
3995 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3997 case Intrinsic::aarch64_sve_scvtf:
3998 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
3999 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4001 case Intrinsic::aarch64_sve_fcvtzu:
4002 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
4003 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4005 case Intrinsic::aarch64_sve_fcvtzs:
4006 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
4007 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4009 case Intrinsic::aarch64_sve_fsqrt:
4010 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
4011 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4012 case Intrinsic::aarch64_sve_frecpx:
4013 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
4014 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4015 case Intrinsic::aarch64_sve_fabs:
4016 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4017 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4018 case Intrinsic::aarch64_sve_abs:
4019 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4020 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4021 case Intrinsic::aarch64_sve_neg:
4022 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4023 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4024 case Intrinsic::aarch64_sve_insr: {
4025 SDValue Scalar = Op.getOperand(2);
4026 EVT ScalarTy = Scalar.getValueType();
4027 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
4028 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
4030 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
4031 Op.getOperand(1), Scalar);
4033 case Intrinsic::aarch64_sve_rbit:
4034 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
4035 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4037 case Intrinsic::aarch64_sve_revb:
4038 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
4039 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4040 case Intrinsic::aarch64_sve_sxtb:
4042 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4043 Op.getOperand(2), Op.getOperand(3),
4044 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4046 case Intrinsic::aarch64_sve_sxth:
4048 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4049 Op.getOperand(2), Op.getOperand(3),
4050 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4052 case Intrinsic::aarch64_sve_sxtw:
4054 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4055 Op.getOperand(2), Op.getOperand(3),
4056 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4058 case Intrinsic::aarch64_sve_uxtb:
4060 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4061 Op.getOperand(2), Op.getOperand(3),
4062 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4064 case Intrinsic::aarch64_sve_uxth:
4066 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4067 Op.getOperand(2), Op.getOperand(3),
4068 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4070 case Intrinsic::aarch64_sve_uxtw:
4072 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4073 Op.getOperand(2), Op.getOperand(3),
4074 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4077 case Intrinsic::localaddress: {
4078 const auto &MF = DAG.getMachineFunction();
4079 const auto *RegInfo = Subtarget->getRegisterInfo();
4080 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
4081 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
4082 Op.getSimpleValueType());
4085 case Intrinsic::eh_recoverfp: {
4086 // FIXME: This needs to be implemented to correctly handle highly aligned
4087 // stack objects. For now we simply return the incoming FP. Refer D53541
4088 // for more details.
4089 SDValue FnOp = Op.getOperand(1);
4090 SDValue IncomingFPOp = Op.getOperand(2);
4091 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
4092 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
4095 "llvm.eh.recoverfp must take a function as the first argument");
4096 return IncomingFPOp;
4099 case Intrinsic::aarch64_neon_vsri:
4100 case Intrinsic::aarch64_neon_vsli: {
4101 EVT Ty = Op.getValueType();
4104 report_fatal_error("Unexpected type for aarch64_neon_vsli");
4106 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
4108 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
4109 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
4110 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
4114 case Intrinsic::aarch64_neon_srhadd:
4115 case Intrinsic::aarch64_neon_urhadd:
4116 case Intrinsic::aarch64_neon_shadd:
4117 case Intrinsic::aarch64_neon_uhadd: {
4118 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4119 IntNo == Intrinsic::aarch64_neon_shadd);
4120 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4121 IntNo == Intrinsic::aarch64_neon_urhadd);
4123 IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
4124 : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD);
4125 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4128 case Intrinsic::aarch64_neon_sabd:
4129 case Intrinsic::aarch64_neon_uabd: {
4130 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
4132 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4135 case Intrinsic::aarch64_neon_uaddlp: {
4136 unsigned Opcode = AArch64ISD::UADDLP;
4137 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
4139 case Intrinsic::aarch64_neon_sdot:
4140 case Intrinsic::aarch64_neon_udot:
4141 case Intrinsic::aarch64_sve_sdot:
4142 case Intrinsic::aarch64_sve_udot: {
4143 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
4144 IntNo == Intrinsic::aarch64_sve_udot)
4147 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4148 Op.getOperand(2), Op.getOperand(3));
4153 bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
4154 if (VT.getVectorElementType() == MVT::i8 ||
4155 VT.getVectorElementType() == MVT::i16) {
4162 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
4163 if (VT.getVectorElementType() == MVT::i32 &&
4164 VT.getVectorElementCount().getKnownMinValue() >= 4)
4170 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
4171 return ExtVal.getValueType().isScalableVector();
4174 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
4175 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
4176 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
4177 AArch64ISD::GLD1_MERGE_ZERO},
4178 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
4179 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
4180 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
4181 AArch64ISD::GLD1_MERGE_ZERO},
4182 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
4183 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
4184 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
4185 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
4186 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
4187 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
4188 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
4189 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
4190 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
4191 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
4193 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
4194 return AddrModes.find(Key)->second;
4197 unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
4198 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
4199 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
4200 AArch64ISD::SST1_PRED},
4201 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
4202 AArch64ISD::SST1_UXTW_PRED},
4203 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
4204 AArch64ISD::SST1_PRED},
4205 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
4206 AArch64ISD::SST1_SXTW_PRED},
4207 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
4208 AArch64ISD::SST1_SCALED_PRED},
4209 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
4210 AArch64ISD::SST1_UXTW_SCALED_PRED},
4211 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
4212 AArch64ISD::SST1_SCALED_PRED},
4213 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
4214 AArch64ISD::SST1_SXTW_SCALED_PRED},
4216 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
4217 return AddrModes.find(Key)->second;
4220 unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
4223 llvm_unreachable("unimplemented opcode");
4225 case AArch64ISD::GLD1_MERGE_ZERO:
4226 return AArch64ISD::GLD1S_MERGE_ZERO;
4227 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
4228 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
4229 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
4230 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
4231 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
4232 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
4233 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
4234 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
4235 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
4236 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
4237 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
4238 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
4242 bool getGatherScatterIndexIsExtended(SDValue Index) {
4243 unsigned Opcode = Index.getOpcode();
4244 if (Opcode == ISD::SIGN_EXTEND_INREG)
4247 if (Opcode == ISD::AND) {
4248 SDValue Splat = Index.getOperand(1);
4249 if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
4251 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
4252 if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
4260 // If the base pointer of a masked gather or scatter is null, we
4261 // may be able to swap BasePtr & Index and use the vector + register
4262 // or vector + immediate addressing mode, e.g.
4263 // VECTOR + REGISTER:
4264 // getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices)
4265 // -> getelementptr %offset, <vscale x N x T> %indices
4266 // VECTOR + IMMEDIATE:
4267 // getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices)
4268 // -> getelementptr #x, <vscale x N x T> %indices
4269 void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT,
4270 unsigned &Opcode, bool IsGather,
4271 SelectionDAG &DAG) {
4272 if (!isNullConstant(BasePtr))
4275 // FIXME: This will not match for fixed vector type codegen as the nodes in
4276 // question will have fixed<->scalable conversions around them. This should be
4277 // moved to a DAG combine or complex pattern so that it executes after all of
4278 // the fixed vector insert and extracts have been removed. This deficiency
4279 // will result in a sub-optimal addressing mode being used, i.e. an ADD not
4280 // being folded into the scatter/gather.
4281 ConstantSDNode *Offset = nullptr;
4282 if (Index.getOpcode() == ISD::ADD)
4283 if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
4284 if (isa<ConstantSDNode>(SplatVal))
4285 Offset = cast<ConstantSDNode>(SplatVal);
4288 Index = Index->getOperand(0);
4294 IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED;
4297 std::swap(BasePtr, Index);
4302 uint64_t OffsetVal = Offset->getZExtValue();
4303 unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
4304 auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
4306 if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) {
4307 // Index is out of range for the immediate addressing mode
4308 BasePtr = ConstOffset;
4309 Index = Index->getOperand(0);
4313 // Immediate is in range
4315 BasePtr = Index->getOperand(0);
4316 Index = ConstOffset;
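// For example, a gather of i32 elements whose index is "indices + splat(16)"
// can use the vector-plus-immediate form with #16 (16 is a multiple of 4 and
// 16 / 4 <= 31), whereas splat(256) is out of range (256 / 4 = 64 > 31) and
// falls back to the register form above.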
4319 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
4320 SelectionDAG &DAG) const {
4322 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
4323 assert(MGT && "Can only custom lower gather load nodes");
4325 bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector();
4327 SDValue Index = MGT->getIndex();
4328 SDValue Chain = MGT->getChain();
4329 SDValue PassThru = MGT->getPassThru();
4330 SDValue Mask = MGT->getMask();
4331 SDValue BasePtr = MGT->getBasePtr();
4332 ISD::LoadExtType ExtTy = MGT->getExtensionType();
4334 ISD::MemIndexType IndexType = MGT->getIndexType();
4335 bool IsScaled =
4336 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
4337 bool IsSigned =
4338 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
4339 bool IdxNeedsExtend =
4340 getGatherScatterIndexIsExtended(Index) ||
4341 Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4342 bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
4344 EVT VT = PassThru.getSimpleValueType();
4345 EVT IndexVT = Index.getSimpleValueType();
4346 EVT MemVT = MGT->getMemoryVT();
4347 SDValue InputVT = DAG.getValueType(MemVT);
4349 if (VT.getVectorElementType() == MVT::bf16 &&
4350 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4353 if (IsFixedLength) {
4354 assert(Subtarget->useSVEForFixedLengthVectors() &&
4355 "Cannot lower when not using SVE for fixed vectors");
4356 if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
4357 IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
4358 MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
4360 MemVT = getContainerForFixedLengthVector(DAG, MemVT);
4361 IndexVT = MemVT.changeTypeToInteger();
4363 InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
4365 ISD::ZERO_EXTEND, DL,
4366 VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
4369 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
4370 PassThru = SDValue();
4372 if (VT.isFloatingPoint() && !IsFixedLength) {
4373 // Handle FP data by using an integer gather and casting the result.
4375 EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
4376 PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
4378 InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4381 SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other);
4383 if (getGatherScatterIndexIsExtended(Index))
4384 Index = Index.getOperand(0);
4386 unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
4387 selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4388 /*isGather=*/true, DAG);
4390 if (ResNeedsSignExtend)
4391 Opcode = getSignExtendedGatherOpcode(Opcode);
4393 if (IsFixedLength) {
4394 if (Index.getSimpleValueType().isFixedLengthVector())
4395 Index = convertToScalableVector(DAG, IndexVT, Index);
4396 if (BasePtr.getSimpleValueType().isFixedLengthVector())
4397 BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
4398 Mask = convertFixedMaskToScalableVector(Mask, DAG);
4401 SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT};
4402 SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops);
4403 Chain = Result.getValue(1);
4405 if (IsFixedLength) {
4406 Result = convertFromScalableVector(
4407 DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()),
4409 Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result);
4410 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
4413 Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru);
4416 Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru);
4418 if (VT.isFloatingPoint())
4419 Result = getSVESafeBitCast(VT, Result, DAG);
4422 return DAG.getMergeValues({Result, Chain}, DL);
4425 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
4426 SelectionDAG &DAG) const {
4428 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
4429 assert(MSC && "Can only custom lower scatter store nodes");
4431 bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector();
4433 SDValue Index = MSC->getIndex();
4434 SDValue Chain = MSC->getChain();
4435 SDValue StoreVal = MSC->getValue();
4436 SDValue Mask = MSC->getMask();
4437 SDValue BasePtr = MSC->getBasePtr();
4439 ISD::MemIndexType IndexType = MSC->getIndexType();
4440 bool IsScaled =
4441 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
4442 bool IsSigned =
4443 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
4444 bool NeedsExtend =
4445 getGatherScatterIndexIsExtended(Index) ||
4446 Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4448 EVT VT = StoreVal.getSimpleValueType();
4449 EVT IndexVT = Index.getSimpleValueType();
4450 SDVTList VTs = DAG.getVTList(MVT::Other);
4451 EVT MemVT = MSC->getMemoryVT();
4452 SDValue InputVT = DAG.getValueType(MemVT);
4454 if (VT.getVectorElementType() == MVT::bf16 &&
4455 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4458 if (IsFixedLength) {
4459 assert(Subtarget->useSVEForFixedLengthVectors() &&
4460 "Cannot lower when not using SVE for fixed vectors");
4461 if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
4462 IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
4463 MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
4465 MemVT = getContainerForFixedLengthVector(DAG, MemVT);
4466 IndexVT = MemVT.changeTypeToInteger();
4468 InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
4471 DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal);
4472 StoreVal = DAG.getNode(
4473 ISD::ANY_EXTEND, DL,
4474 VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
4475 StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
4477 ISD::ZERO_EXTEND, DL,
4478 VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
4479 } else if (VT.isFloatingPoint()) {
4480 // Handle FP data by casting the data so an integer scatter can be used.
4481 EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
4482 StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
4483 InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4486 if (getGatherScatterIndexIsExtended(Index))
4487 Index = Index.getOperand(0);
4489 unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
4490 selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4491 /*isGather=*/false, DAG);
4493 if (IsFixedLength) {
4494 if (Index.getSimpleValueType().isFixedLengthVector())
4495 Index = convertToScalableVector(DAG, IndexVT, Index);
4496 if (BasePtr.getSimpleValueType().isFixedLengthVector())
4497 BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
4498 Mask = convertFixedMaskToScalableVector(Mask, DAG);
4501 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
4502 return DAG.getNode(Opcode, DL, VTs, Ops);
4505 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
4507 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
4508 assert(LoadNode && "Expected custom lowering of a masked load node");
4509 EVT VT = Op->getValueType(0);
4511 if (useSVEForFixedLengthVectorVT(VT, true))
4512 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
4514 SDValue PassThru = LoadNode->getPassThru();
4515 SDValue Mask = LoadNode->getMask();
4517 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
4520 SDValue Load = DAG.getMaskedLoad(
4521 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
4522 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
4523 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
4524 LoadNode->getExtensionType());
4526 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
4528 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
4531 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
4532 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
4534 SelectionDAG &DAG) {
4535 assert(VT.isVector() && "VT should be a vector type");
4536 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
4538 SDValue Value = ST->getValue();
4540 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
4541 // extracts the word lane which represents the v4i8 subvector. It optimizes
4542 // the store to:
4543 //   xtn v0.8b, v0.8h
4544 //   str s0, [x0]
4547 SDValue Undef = DAG.getUNDEF(MVT::i16);
4548 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
4549 {Undef, Undef, Undef, Undef});
4551 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
4553 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
4555 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
4556 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
4557 Trunc, DAG.getConstant(0, DL, MVT::i64));
4559 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
4560 ST->getBasePtr(), ST->getMemOperand());
4563 // Custom lowering for any store, vector or scalar, with or without a
4564 // truncate. Currently this custom lowers truncating stores from v4i16 to
4565 // v4i8, volatile i128 stores, 256-bit non-temporal stores and i64x8 stores.
4566 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
4567 SelectionDAG &DAG) const {
4569 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
4570 assert(StoreNode && "Can only custom lower store nodes");
4572 SDValue Value = StoreNode->getValue();
4574 EVT VT = Value.getValueType();
4575 EVT MemVT = StoreNode->getMemoryVT();
4577 if (VT.isVector()) {
4578 if (useSVEForFixedLengthVectorVT(VT, true))
4579 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
4581 unsigned AS = StoreNode->getAddressSpace();
4582 Align Alignment = StoreNode->getAlign();
4583 if (Alignment < MemVT.getStoreSize() &&
4584 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
4585 StoreNode->getMemOperand()->getFlags(),
4587 return scalarizeVectorStore(StoreNode, DAG);
4590 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
4591 MemVT == MVT::v4i8) {
4592 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
4594 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
4595 // the custom lowering, as there are no un-paired non-temporal stores and
4596 // legalization will break up 256 bit inputs.
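// The 256-bit value is split below into its low and high 128-bit halves,
// which become the two paired operands of a single STNP node.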
4597 ElementCount EC = MemVT.getVectorElementCount();
4598 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
4600 ((MemVT.getScalarSizeInBits() == 8u ||
4601 MemVT.getScalarSizeInBits() == 16u ||
4602 MemVT.getScalarSizeInBits() == 32u ||
4603 MemVT.getScalarSizeInBits() == 64u))) {
4605 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
4606 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4607 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
4609 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
4610 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4611 StoreNode->getValue(),
4612 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
4613 SDValue Result = DAG.getMemIntrinsicNode(
4614 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
4615 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4616 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4619 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
4620 assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
4622 DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
4623 DAG.getConstant(0, Dl, MVT::i64));
4625 DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
4626 DAG.getConstant(1, Dl, MVT::i64));
4627 SDValue Result = DAG.getMemIntrinsicNode(
4628 AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
4629 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4630 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4632 } else if (MemVT == MVT::i64x8) {
4633 SDValue Value = StoreNode->getValue();
4634 assert(Value->getValueType(0) == MVT::i64x8);
4635 SDValue Chain = StoreNode->getChain();
4636 SDValue Base = StoreNode->getBasePtr();
4637 EVT PtrVT = Base.getValueType();
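// An i64x8 value (used for the LS64 64-byte load/store support) is stored as
// eight consecutive 8-byte stores: lane i is extracted with LS64_EXTRACT and
// written at Base + i * 8, with the chain threaded through every store.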
4638 for (unsigned i = 0; i < 8; i++) {
4639 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
4640 Value, DAG.getConstant(i, Dl, MVT::i32));
4641 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
4642 DAG.getConstant(i * 8, Dl, PtrVT));
4643 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
4644 StoreNode->getOriginalAlign());
4652 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
4653 SelectionDAG &DAG) const {
4655 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
4656 assert(LoadNode && "Expected custom lowering of a load node");
4658 if (LoadNode->getMemoryVT() == MVT::i64x8) {
4659 SmallVector<SDValue, 8> Ops;
4660 SDValue Base = LoadNode->getBasePtr();
4661 SDValue Chain = LoadNode->getChain();
4662 EVT PtrVT = Base.getValueType();
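// Mirror of the i64x8 store lowering: eight chained 8-byte loads are
// reassembled into a single i64x8 value with LS64_BUILD.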
4663 for (unsigned i = 0; i < 8; i++) {
4664 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
4665 DAG.getConstant(i * 8, DL, PtrVT));
4666 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
4667 LoadNode->getPointerInfo(),
4668 LoadNode->getOriginalAlign());
4669 Ops.push_back(Part);
4670 Chain = SDValue(Part.getNode(), 1);
4672 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
4673 return DAG.getMergeValues({Loaded, Chain}, DL);
4676 // Custom lowering for extending v4i8 vector loads.
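// The four bytes are loaded as one f32, inserted into a v2f32, bitcast to
// v8i8 and widened to v8i16; the low v4i16 half is then extracted, and a
// second extend produces v4i32 when that is the requested result type.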
4677 EVT VT = Op->getValueType(0);
4678 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
4680 if (LoadNode->getMemoryVT() != MVT::v4i8)
4684 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
4685 ExtType = ISD::SIGN_EXTEND;
4686 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
4687 LoadNode->getExtensionType() == ISD::EXTLOAD)
4688 ExtType = ISD::ZERO_EXTEND;
4692 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
4693 LoadNode->getBasePtr(), MachinePointerInfo());
4694 SDValue Chain = Load.getValue(1);
4695 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
4696 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
4697 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
4698 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
4699 DAG.getConstant(0, DL, MVT::i64));
4700 if (VT == MVT::v4i32)
4701 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
4702 return DAG.getMergeValues({Ext, Chain}, DL);
4705 // Generate SUBS and CSEL for integer abs.
4706 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
4707 MVT VT = Op.getSimpleValueType();
4710 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
4713 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
4715 // Generate SUBS & CSEL.
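// Roughly: abs(x) = (x >= 0) ? x : 0 - x. The PL (plus or zero) condition
// from SUBS x, #0 drives the CSEL between x and its negation.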
4717 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
4718 Op.getOperand(0), DAG.getConstant(0, DL, VT));
4719 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
4720 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
4724 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
4725 SelectionDAG &DAG) const {
4726 LLVM_DEBUG(dbgs() << "Custom lowering: ");
4727 LLVM_DEBUG(Op.dump());
4729 switch (Op.getOpcode()) {
4731 llvm_unreachable("unimplemented operand");
4734 return LowerBITCAST(Op, DAG);
4735 case ISD::GlobalAddress:
4736 return LowerGlobalAddress(Op, DAG);
4737 case ISD::GlobalTLSAddress:
4738 return LowerGlobalTLSAddress(Op, DAG);
4740 case ISD::STRICT_FSETCC:
4741 case ISD::STRICT_FSETCCS:
4742 return LowerSETCC(Op, DAG);
4744 return LowerBR_CC(Op, DAG);
4746 return LowerSELECT(Op, DAG);
4747 case ISD::SELECT_CC:
4748 return LowerSELECT_CC(Op, DAG);
4749 case ISD::JumpTable:
4750 return LowerJumpTable(Op, DAG);
4752 return LowerBR_JT(Op, DAG);
4753 case ISD::ConstantPool:
4754 return LowerConstantPool(Op, DAG);
4755 case ISD::BlockAddress:
4756 return LowerBlockAddress(Op, DAG);
4758 return LowerVASTART(Op, DAG);
4760 return LowerVACOPY(Op, DAG);
4762 return LowerVAARG(Op, DAG);
4767 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
4774 return LowerXALUO(Op, DAG);
4776 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
4778 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
4780 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
4782 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
4784 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
4786 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
4788 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
4790 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
4791 case ISD::FNEARBYINT:
4792 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
4794 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
4796 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
4797 case ISD::FROUNDEVEN:
4798 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
4800 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
4802 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
4804 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
4806 case ISD::STRICT_FP_ROUND:
4807 return LowerFP_ROUND(Op, DAG);
4808 case ISD::FP_EXTEND:
4809 return LowerFP_EXTEND(Op, DAG);
4810 case ISD::FRAMEADDR:
4811 return LowerFRAMEADDR(Op, DAG);
4812 case ISD::SPONENTRY:
4813 return LowerSPONENTRY(Op, DAG);
4814 case ISD::RETURNADDR:
4815 return LowerRETURNADDR(Op, DAG);
4816 case ISD::ADDROFRETURNADDR:
4817 return LowerADDROFRETURNADDR(Op, DAG);
4818 case ISD::CONCAT_VECTORS:
4819 return LowerCONCAT_VECTORS(Op, DAG);
4820 case ISD::INSERT_VECTOR_ELT:
4821 return LowerINSERT_VECTOR_ELT(Op, DAG);
4822 case ISD::EXTRACT_VECTOR_ELT:
4823 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
4824 case ISD::BUILD_VECTOR:
4825 return LowerBUILD_VECTOR(Op, DAG);
4826 case ISD::VECTOR_SHUFFLE:
4827 return LowerVECTOR_SHUFFLE(Op, DAG);
4828 case ISD::SPLAT_VECTOR:
4829 return LowerSPLAT_VECTOR(Op, DAG);
4830 case ISD::EXTRACT_SUBVECTOR:
4831 return LowerEXTRACT_SUBVECTOR(Op, DAG);
4832 case ISD::INSERT_SUBVECTOR:
4833 return LowerINSERT_SUBVECTOR(Op, DAG);
4836 return LowerDIV(Op, DAG);
4838 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
4839 /*OverrideNEON=*/true);
4841 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
4842 /*OverrideNEON=*/true);
4844 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
4845 /*OverrideNEON=*/true);
4847 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
4848 /*OverrideNEON=*/true);
4852 return LowerVectorSRA_SRL_SHL(Op, DAG);
4853 case ISD::SHL_PARTS:
4854 case ISD::SRL_PARTS:
4855 case ISD::SRA_PARTS:
4856 return LowerShiftParts(Op, DAG);
4858 return LowerCTPOP(Op, DAG);
4859 case ISD::FCOPYSIGN:
4860 return LowerFCOPYSIGN(Op, DAG);
4862 return LowerVectorOR(Op, DAG);
4864 return LowerXOR(Op, DAG);
4866 return LowerPREFETCH(Op, DAG);
4867 case ISD::SINT_TO_FP:
4868 case ISD::UINT_TO_FP:
4869 case ISD::STRICT_SINT_TO_FP:
4870 case ISD::STRICT_UINT_TO_FP:
4871 return LowerINT_TO_FP(Op, DAG);
4872 case ISD::FP_TO_SINT:
4873 case ISD::FP_TO_UINT:
4874 case ISD::STRICT_FP_TO_SINT:
4875 case ISD::STRICT_FP_TO_UINT:
4876 return LowerFP_TO_INT(Op, DAG);
4877 case ISD::FP_TO_SINT_SAT:
4878 case ISD::FP_TO_UINT_SAT:
4879 return LowerFP_TO_INT_SAT(Op, DAG);
4881 return LowerFSINCOS(Op, DAG);
4882 case ISD::FLT_ROUNDS_:
4883 return LowerFLT_ROUNDS_(Op, DAG);
4884 case ISD::SET_ROUNDING:
4885 return LowerSET_ROUNDING(Op, DAG);
4887 return LowerMUL(Op, DAG);
4889 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
4890 /*OverrideNEON=*/true);
4892 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
4893 /*OverrideNEON=*/true);
4894 case ISD::INTRINSIC_WO_CHAIN:
4895 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
4897 return LowerSTORE(Op, DAG);
4899 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
4901 return LowerMGATHER(Op, DAG);
4903 return LowerMSCATTER(Op, DAG);
4904 case ISD::VECREDUCE_SEQ_FADD:
4905 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
4906 case ISD::VECREDUCE_ADD:
4907 case ISD::VECREDUCE_AND:
4908 case ISD::VECREDUCE_OR:
4909 case ISD::VECREDUCE_XOR:
4910 case ISD::VECREDUCE_SMAX:
4911 case ISD::VECREDUCE_SMIN:
4912 case ISD::VECREDUCE_UMAX:
4913 case ISD::VECREDUCE_UMIN:
4914 case ISD::VECREDUCE_FADD:
4915 case ISD::VECREDUCE_FMAX:
4916 case ISD::VECREDUCE_FMIN:
4917 return LowerVECREDUCE(Op, DAG);
4918 case ISD::ATOMIC_LOAD_SUB:
4919 return LowerATOMIC_LOAD_SUB(Op, DAG);
4920 case ISD::ATOMIC_LOAD_AND:
4921 return LowerATOMIC_LOAD_AND(Op, DAG);
4922 case ISD::DYNAMIC_STACKALLOC:
4923 return LowerDYNAMIC_STACKALLOC(Op, DAG);
4925 return LowerVSCALE(Op, DAG);
4926 case ISD::ANY_EXTEND:
4927 case ISD::SIGN_EXTEND:
4928 case ISD::ZERO_EXTEND:
4929 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
4930 case ISD::SIGN_EXTEND_INREG: {
4931 // Only custom lower when ExtraVT has a legal byte-based element type.
4932 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4933 EVT ExtraEltVT = ExtraVT.getVectorElementType();
4934 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
4935 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
4938 return LowerToPredicatedOp(Op, DAG,
4939 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
4942 return LowerTRUNCATE(Op, DAG);
4944 return LowerMLOAD(Op, DAG);
4946 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
4947 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
4948 return LowerLOAD(Op, DAG);
4950 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
4952 return LowerToScalableOp(Op, DAG);
4954 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
4956 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
4958 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
4960 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
4962 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
4964 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
4966 return LowerABS(Op, DAG);
4967 case ISD::BITREVERSE:
4968 return LowerBitreverse(Op, DAG);
4970 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
4972 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
4973 /*OverrideNEON=*/true);
4975 return LowerCTTZ(Op, DAG);
4976 case ISD::VECTOR_SPLICE:
4977 return LowerVECTOR_SPLICE(Op, DAG);
4981 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
4982 return !Subtarget->useSVEForFixedLengthVectors();
4985 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
4986 EVT VT, bool OverrideNEON) const {
4987 if (!Subtarget->useSVEForFixedLengthVectors())
4990 if (!VT.isFixedLengthVector())
4993 // Don't use SVE for vectors we cannot scalarize if required.
4994 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
4995 // Fixed length predicates should be promoted to i8.
4996 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
5010 // All SVE implementations support NEON sized vectors.
5011 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
5014 // Ensure NEON MVTs only belong to a single register class.
5015 if (VT.getFixedSizeInBits() <= 128)
5018 // Don't use SVE for types that don't fit.
5019 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
5022 // TODO: Perhaps an artificial restriction, but worth having whilst getting
5023 // the base fixed length SVE support in place.
5024 if (!VT.isPow2VectorType())
5030 //===----------------------------------------------------------------------===//
5031 // Calling Convention Implementation
5032 //===----------------------------------------------------------------------===//
5034 /// Selects the correct CCAssignFn for a given CallingConvention value.
5035 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
5036 bool IsVarArg) const {
5039 report_fatal_error("Unsupported calling convention.");
5040 case CallingConv::WebKit_JS:
5041 return CC_AArch64_WebKit_JS;
5042 case CallingConv::GHC:
5043 return CC_AArch64_GHC;
5044 case CallingConv::C:
5045 case CallingConv::Fast:
5046 case CallingConv::PreserveMost:
5047 case CallingConv::CXX_FAST_TLS:
5048 case CallingConv::Swift:
5049 case CallingConv::SwiftTail:
5050 case CallingConv::Tail:
5051 if (Subtarget->isTargetWindows() && IsVarArg)
5052 return CC_AArch64_Win64_VarArg;
5053 if (!Subtarget->isTargetDarwin())
5054 return CC_AArch64_AAPCS;
5056 return CC_AArch64_DarwinPCS;
5057 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
5058 : CC_AArch64_DarwinPCS_VarArg;
5059 case CallingConv::Win64:
5060 return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
5061 case CallingConv::CFGuard_Check:
5062 return CC_AArch64_Win64_CFGuard_Check;
5063 case CallingConv::AArch64_VectorCall:
5064 case CallingConv::AArch64_SVE_VectorCall:
5065 return CC_AArch64_AAPCS;
5070 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
5071 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
5072 : RetCC_AArch64_AAPCS;
5075 SDValue AArch64TargetLowering::LowerFormalArguments(
5076 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
5077 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
5078 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
5079 MachineFunction &MF = DAG.getMachineFunction();
5080 MachineFrameInfo &MFI = MF.getFrameInfo();
5081 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
5083 // Assign locations to all of the incoming arguments.
5084 SmallVector<CCValAssign, 16> ArgLocs;
5085 DenseMap<unsigned, SDValue> CopiedRegs;
5086 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
5089 // At this point, Ins[].VT may already be promoted to i32. To correctly
5090 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5091 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5092 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
5093 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
5095 unsigned NumArgs = Ins.size();
5096 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
5097 unsigned CurArgIdx = 0;
5098 for (unsigned i = 0; i != NumArgs; ++i) {
5099 MVT ValVT = Ins[i].VT;
5100 if (Ins[i].isOrigArg()) {
5101 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
5102 CurArgIdx = Ins[i].getOrigArgIndex();
5104 // Get type of the original argument.
5105 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
5106 /*AllowUnknown*/ true);
5107 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
5108 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
5109 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5111 else if (ActualMVT == MVT::i16)
5114 bool UseVarArgCC = false;
5116 UseVarArgCC = isVarArg;
5117 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
5119 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
5120 assert(!Res && "Call operand has unhandled type");
5123 SmallVector<SDValue, 16> ArgValues;
5124 unsigned ExtraArgLocs = 0;
5125 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5126 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
5128 if (Ins[i].Flags.isByVal()) {
5129 // Byval is used for HFAs in the PCS, but the system should work in a
5130 // non-compliant manner for larger structs.
5131 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5132 int Size = Ins[i].Flags.getByValSize();
5133 unsigned NumRegs = (Size + 7) / 8;
5135 // FIXME: This works on big-endian for composite byvals, which are the common
5136 // case. It should also work for fundamental types.
5138 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
5139 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
5140 InVals.push_back(FrameIdxN);
5145 if (Ins[i].Flags.isSwiftAsync())
5146 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5149 if (VA.isRegLoc()) {
5150 // Arguments stored in registers.
5151 EVT RegVT = VA.getLocVT();
5152 const TargetRegisterClass *RC;
5154 if (RegVT == MVT::i32)
5155 RC = &AArch64::GPR32RegClass;
5156 else if (RegVT == MVT::i64)
5157 RC = &AArch64::GPR64RegClass;
5158 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
5159 RC = &AArch64::FPR16RegClass;
5160 else if (RegVT == MVT::f32)
5161 RC = &AArch64::FPR32RegClass;
5162 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
5163 RC = &AArch64::FPR64RegClass;
5164 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
5165 RC = &AArch64::FPR128RegClass;
5166 else if (RegVT.isScalableVector() &&
5167 RegVT.getVectorElementType() == MVT::i1)
5168 RC = &AArch64::PPRRegClass;
5169 else if (RegVT.isScalableVector())
5170 RC = &AArch64::ZPRRegClass;
5172 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
5174 // Transform the arguments in physical registers into virtual ones.
5175 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
5176 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
5178 // If this is an 8, 16 or 32-bit value, it is really passed promoted
5179 // to 64 bits. Insert an assert[sz]ext to capture this, then
5180 // truncate to the right size.
5181 switch (VA.getLocInfo()) {
5183 llvm_unreachable("Unknown loc info!");
5184 case CCValAssign::Full:
5186 case CCValAssign::Indirect:
5187 assert(VA.getValVT().isScalableVector() &&
5188 "Only scalable vectors can be passed indirectly");
5190 case CCValAssign::BCvt:
5191 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
5193 case CCValAssign::AExt:
5194 case CCValAssign::SExt:
5195 case CCValAssign::ZExt:
5197 case CCValAssign::AExtUpper:
5198 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
5199 DAG.getConstant(32, DL, RegVT));
5200 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
5203 } else { // VA.isRegLoc()
5204 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
5205 unsigned ArgOffset = VA.getLocMemOffset();
5206 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
5207 ? VA.getLocVT().getSizeInBits()
5208 : VA.getValVT().getSizeInBits()) / 8;
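// On big-endian targets an argument smaller than 8 bytes occupies the high
// bytes of its 8-byte slot, so the frame object is offset by 8 - ArgSize.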
5210 uint32_t BEAlign = 0;
5211 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
5212 !Ins[i].Flags.isInConsecutiveRegs())
5213 BEAlign = 8 - ArgSize;
5215 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
5217 // Create load nodes to retrieve arguments from the stack.
5218 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
5220 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
5221 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
5222 MVT MemVT = VA.getValVT();
5224 switch (VA.getLocInfo()) {
5227 case CCValAssign::Trunc:
5228 case CCValAssign::BCvt:
5229 MemVT = VA.getLocVT();
5231 case CCValAssign::Indirect:
5232 assert(VA.getValVT().isScalableVector() &&
5233 "Only scalable vectors can be passed indirectly");
5234 MemVT = VA.getLocVT();
5236 case CCValAssign::SExt:
5237 ExtType = ISD::SEXTLOAD;
5239 case CCValAssign::ZExt:
5240 ExtType = ISD::ZEXTLOAD;
5242 case CCValAssign::AExt:
5243 ExtType = ISD::EXTLOAD;
5247 ArgValue = DAG.getExtLoad(
5248 ExtType, DL, VA.getLocVT(), Chain, FIN,
5249 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
5253 if (VA.getLocInfo() == CCValAssign::Indirect) {
5254 assert(VA.getValVT().isScalableVector() &&
5255 "Only scalable vectors can be passed indirectly");
5257 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
5258 unsigned NumParts = 1;
5259 if (Ins[i].Flags.isInConsecutiveRegs()) {
5260 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
5261 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5265 MVT PartLoad = VA.getValVT();
5266 SDValue Ptr = ArgValue;
5268 // Ensure we generate all loads for each tuple part, whilst updating the
5269 // pointer after each load correctly using vscale.
5270 while (NumParts > 0) {
5271 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
5272 InVals.push_back(ArgValue);
5275 SDValue BytesIncrement = DAG.getVScale(
5276 DL, Ptr.getValueType(),
5277 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
5279 Flags.setNoUnsignedWrap(true);
5280 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5281 BytesIncrement, Flags);
5287 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
5288 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
5289 ArgValue, DAG.getValueType(MVT::i32));
5290 InVals.push_back(ArgValue);
5293 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
5296 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5298 if (!Subtarget->isTargetDarwin() || IsWin64) {
5299 // The AAPCS variadic function ABI is identical to the non-variadic
5300 // one. As a result there may be more arguments in registers and we should
5301 // save them for future reference.
5302 // Win64 variadic functions also pass arguments in registers, but all float
5303 // arguments are passed in integer registers.
5304 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
5307 // This will point to the next argument passed via stack.
5308 unsigned StackOffset = CCInfo.getNextStackOffset();
5309 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
5310 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
5311 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
5313 if (MFI.hasMustTailInVarArgFunc()) {
5314 SmallVector<MVT, 2> RegParmTypes;
5315 RegParmTypes.push_back(MVT::i64);
5316 RegParmTypes.push_back(MVT::f128);
5317 // Compute the set of forwarded registers. The rest are scratch.
5318 SmallVectorImpl<ForwardedRegister> &Forwards =
5319 FuncInfo->getForwardedMustTailRegParms();
5320 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
5323 // Conservatively forward X8, since it might be used for aggregate return.
5324 if (!CCInfo.isAllocated(AArch64::X8)) {
5325 unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
5326 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
5331 // On Windows, InReg pointers must be returned, so record the pointer in a
5332 // virtual register at the start of the function so it can be returned in the
5335 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
5336 if (Ins[I].Flags.isInReg()) {
5337 assert(!FuncInfo->getSRetReturnReg());
5339 MVT PtrTy = getPointerTy(DAG.getDataLayout());
5341 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
5342 FuncInfo->setSRetReturnReg(Reg);
5344 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
5345 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
5351 unsigned StackArgSize = CCInfo.getNextStackOffset();
5352 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
5353 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
5354 // This is a non-standard ABI so by fiat I say we're allowed to make full
5355 // use of the stack area to be popped, which must be aligned to 16 bytes in
5357 StackArgSize = alignTo(StackArgSize, 16);
5359 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
5360 // a multiple of 16.
5361 FuncInfo->setArgumentStackToRestore(StackArgSize);
5363 // This realignment carries over to the available bytes below. Our own
5364 // callers will guarantee the space is free by giving an aligned value to
5367 // Even if we're not expected to free up the space, it's useful to know how
5368 // much is there while considering tail calls (because we can reuse it).
5369 FuncInfo->setBytesInStackArgArea(StackArgSize);
5371 if (Subtarget->hasCustomCallingConv())
5372 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
5377 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
5380 SDValue &Chain) const {
5381 MachineFunction &MF = DAG.getMachineFunction();
5382 MachineFrameInfo &MFI = MF.getFrameInfo();
5383 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5384 auto PtrVT = getPointerTy(DAG.getDataLayout());
5385 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
5387 SmallVector<SDValue, 8> MemOps;
5389 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
5390 AArch64::X3, AArch64::X4, AArch64::X5,
5391 AArch64::X6, AArch64::X7 };
5392 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
5393 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
5395 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
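// Every X register (x0-x7) not consumed by named arguments must still be
// spilled so va_arg can find it, hence 8 bytes per remaining register.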
5397 if (GPRSaveSize != 0) {
5399 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
5400 if (GPRSaveSize & 15)
5401 // The extra size here, if triggered, will always be 8.
5402 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
5404 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
5406 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
5408 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
5409 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
5410 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
5411 SDValue Store = DAG.getStore(
5412 Val.getValue(1), DL, Val, FIN,
5414 ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
5416 (i - FirstVariadicGPR) * 8)
5417 : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
5418 MemOps.push_back(Store);
5420 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
5423 FuncInfo->setVarArgsGPRIndex(GPRIdx);
5424 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
5426 if (Subtarget->hasFPARMv8() && !IsWin64) {
5427 static const MCPhysReg FPRArgRegs[] = {
5428 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
5429 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
5430 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
5431 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
5433 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
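// Vector registers are saved as full 16-byte Q registers, hence 16 bytes
// per remaining register.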
5435 if (FPRSaveSize != 0) {
5436 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
5438 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
5440 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
5441 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
5442 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
5444 SDValue Store = DAG.getStore(
5445 Val.getValue(1), DL, Val, FIN,
5446 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
5447 MemOps.push_back(Store);
5448 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
5449 DAG.getConstant(16, DL, PtrVT));
5452 FuncInfo->setVarArgsFPRIndex(FPRIdx);
5453 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
5456 if (!MemOps.empty()) {
5457 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
5461 /// LowerCallResult - Lower the result values of a call into the
5462 /// appropriate copies out of appropriate physical registers.
5463 SDValue AArch64TargetLowering::LowerCallResult(
5464 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
5465 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
5466 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
5467 SDValue ThisVal) const {
5468 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
5469 // Assign locations to each value returned by this call.
5470 SmallVector<CCValAssign, 16> RVLocs;
5471 DenseMap<unsigned, SDValue> CopiedRegs;
5472 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
5474 CCInfo.AnalyzeCallResult(Ins, RetCC);
5476 // Copy all of the result registers out of their specified physreg.
5477 for (unsigned i = 0; i != RVLocs.size(); ++i) {
5478 CCValAssign VA = RVLocs[i];
5480 // Pass 'this' value directly from the argument to return value, to avoid
5481 // reg unit interference
5482 if (i == 0 && isThisReturn) {
5483 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
5484 "unexpected return calling convention register assignment");
5485 InVals.push_back(ThisVal);
5489 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
5490 // allows one use of a physreg per block.
5491 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
5494 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
5495 Chain = Val.getValue(1);
5496 InFlag = Val.getValue(2);
5497 CopiedRegs[VA.getLocReg()] = Val;
5500 switch (VA.getLocInfo()) {
5502 llvm_unreachable("Unknown loc info!");
5503 case CCValAssign::Full:
5505 case CCValAssign::BCvt:
5506 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
5508 case CCValAssign::AExtUpper:
5509 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
5510 DAG.getConstant(32, DL, VA.getLocVT()));
5512 case CCValAssign::AExt:
5514 case CCValAssign::ZExt:
5515 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
5519 InVals.push_back(Val);
5525 /// Return true if the calling convention is one that we can guarantee TCO for.
5526 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
5527 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
5528 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
5531 /// Return true if we might ever do TCO for calls with this calling convention.
5532 static bool mayTailCallThisCC(CallingConv::ID CC) {
5534 case CallingConv::C:
5535 case CallingConv::AArch64_SVE_VectorCall:
5536 case CallingConv::PreserveMost:
5537 case CallingConv::Swift:
5538 case CallingConv::SwiftTail:
5539 case CallingConv::Tail:
5540 case CallingConv::Fast:
5547 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
5548 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
5549 const SmallVectorImpl<ISD::OutputArg> &Outs,
5550 const SmallVectorImpl<SDValue> &OutVals,
5551 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5552 if (!mayTailCallThisCC(CalleeCC))
5555 MachineFunction &MF = DAG.getMachineFunction();
5556 const Function &CallerF = MF.getFunction();
5557 CallingConv::ID CallerCC = CallerF.getCallingConv();
5559 // Functions using the C or Fast calling convention that have an SVE signature
5560 // preserve more registers and should assume the SVE_VectorCall CC.
5561 // The check for matching callee-saved regs will determine whether it is
5562 // eligible for TCO.
5563 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
5564 AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
5565 CallerCC = CallingConv::AArch64_SVE_VectorCall;
5567 bool CCMatch = CallerCC == CalleeCC;
5569 // When using the Windows calling convention on a non-windows OS, we want
5570 // to back up and restore X18 in such functions; we can't do a tail call
5571 // from those functions.
5572 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
5573 CalleeCC != CallingConv::Win64)
5576 // Byval parameters hand the function a pointer directly into the stack area
5577 // we want to reuse during a tail call. Working around this *is* possible (see
5578 // X86) but less efficient and uglier in LowerCall.
5579 for (Function::const_arg_iterator i = CallerF.arg_begin(),
5580 e = CallerF.arg_end();
5582 if (i->hasByValAttr())
5585 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
5586 // In this case, it is necessary to save/restore X0 in the callee. Tail
5587 // call opt interferes with this. So we disable tail call opt when the
5588 // caller has an argument with "inreg" attribute.
5590 // FIXME: Check whether the callee also has an "inreg" argument.
5591 if (i->hasInRegAttr())
5595 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
5598 // Externally-defined functions with weak linkage should not be
5599 // tail-called on AArch64 when the OS does not support dynamic
5600 // pre-emption of symbols, as the AAELF spec requires normal calls
5601 // to undefined weak functions to be replaced with a NOP or jump to the
5602 // next instruction. The behaviour of branch instructions in this
5603 // situation (as used for tail calls) is implementation-defined, so we
5604 // cannot rely on the linker replacing the tail call with a return.
5605 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
5606 const GlobalValue *GV = G->getGlobal();
5607 const Triple &TT = getTargetMachine().getTargetTriple();
5608 if (GV->hasExternalWeakLinkage() &&
5609 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
5613 // Now we search for cases where we can use a tail call without changing the
5614 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
5617 // I want anyone implementing a new calling convention to think long and hard
5618 // about this assert.
5619 assert((!isVarArg || CalleeCC == CallingConv::C) &&
5620 "Unexpected variadic calling convention");
5622 LLVMContext &C = *DAG.getContext();
5623 if (isVarArg && !Outs.empty()) {
5624 // At least two cases here: if caller is fastcc then we can't have any
5625 // memory arguments (we'd be expected to clean up the stack afterwards). If
5626 // caller is C then we could potentially use its argument area.
5628 // FIXME: for now we take the most conservative of these in both cases:
5629 // disallow all variadic memory operands.
5630 SmallVector<CCValAssign, 16> ArgLocs;
5631 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5633 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
5634 for (const CCValAssign &ArgLoc : ArgLocs)
5635 if (!ArgLoc.isRegLoc())
5639 // Check that the call results are passed in the same way.
5640 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5641 CCAssignFnForCall(CalleeCC, isVarArg),
5642 CCAssignFnForCall(CallerCC, isVarArg)))
5644 // The callee has to preserve all registers the caller needs to preserve.
5645 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5646 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5648 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5649 if (Subtarget->hasCustomCallingConv()) {
5650 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
5651 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
5653 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5657 // Nothing more to check if the callee is taking no arguments
5661 SmallVector<CCValAssign, 16> ArgLocs;
5662 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5664 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
5666 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5668 // If any of the arguments is passed indirectly, it must be SVE, so the
5669 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
5670 // allocate space on the stack. That is why we check this explicitly here:
5671 // if any argument is passed indirectly, the call cannot be a tail call.
5672 if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
5673 assert((A.getLocInfo() != CCValAssign::Indirect ||
5674 A.getValVT().isScalableVector()) &&
5675 "Expected value to be scalable");
5676 return A.getLocInfo() == CCValAssign::Indirect;
5680 // If the stack arguments for this call do not fit into our own save area then
5681 // the call cannot be made tail.
5682 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
5685 const MachineRegisterInfo &MRI = MF.getRegInfo();
5686 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5692 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
5694 MachineFrameInfo &MFI,
5695 int ClobberedFI) const {
5696 SmallVector<SDValue, 8> ArgChains;
5697 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
5698 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
5700 // Include the original chain at the beginning of the list. When this is
5701 // used by target LowerCall hooks, this helps legalize find the
5702 // CALLSEQ_BEGIN node.
5703 ArgChains.push_back(Chain);
5705 // Add a chain value for each stack-argument load that overlaps the clobbered frame object.
5706 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
5707 UE = DAG.getEntryNode().getNode()->use_end();
5709 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
5710 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
5711 if (FI->getIndex() < 0) {
5712 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
5713 int64_t InLastByte = InFirstByte;
5714 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
5716 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
5717 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
5718 ArgChains.push_back(SDValue(L, 1));
5721 // Build a tokenfactor for all the chains.
5722 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
5725 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
5726 bool TailCallOpt) const {
5727 return (CallCC == CallingConv::Fast && TailCallOpt) ||
5728 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
5731 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
5732 /// and add input and output parameter nodes.
5734 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
5735 SmallVectorImpl<SDValue> &InVals) const {
5736 SelectionDAG &DAG = CLI.DAG;
5738 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
5739 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
5740 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
5741 SDValue Chain = CLI.Chain;
5742 SDValue Callee = CLI.Callee;
5743 bool &IsTailCall = CLI.IsTailCall;
5744 CallingConv::ID CallConv = CLI.CallConv;
5745 bool IsVarArg = CLI.IsVarArg;
5747 MachineFunction &MF = DAG.getMachineFunction();
5748 MachineFunction::CallSiteInfo CSInfo;
5749 bool IsThisReturn = false;
5751 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5752 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
5753 bool IsSibCall = false;
5754 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv);
5756 // Check callee args/returns for SVE registers and set calling convention
5758 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
5759 bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
5760 return Out.VT.isScalableVector();
5762 bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
5763 return In.VT.isScalableVector();
5766 if (CalleeInSVE || CalleeOutSVE)
5767 CallConv = CallingConv::AArch64_SVE_VectorCall;
5771 // Check if it's really possible to do a tail call.
5772 IsTailCall = isEligibleForTailCallOptimization(
5773 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
5775 // A sibling call is one where we're under the usual C ABI and not planning
5776 // to change that but can still do a tail call:
5777 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
5778 CallConv != CallingConv::SwiftTail)
5785 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
5786 report_fatal_error("failed to perform tail call elimination on a call "
5787 "site marked musttail");
5789 // Analyze operands of the call, assigning locations to each operand.
5790 SmallVector<CCValAssign, 16> ArgLocs;
5791 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
5795 // Handle fixed and variable vector arguments differently.
5796 // Variable vector arguments always go into memory.
5797 unsigned NumArgs = Outs.size();
5799 for (unsigned i = 0; i != NumArgs; ++i) {
5800 MVT ArgVT = Outs[i].VT;
5801 if (!Outs[i].IsFixed && ArgVT.isScalableVector())
5802 report_fatal_error("Passing SVE types to variadic functions is "
5803 "currently not supported");
5805 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5806 bool UseVarArgCC = !Outs[i].IsFixed;
5807 // On Windows, the fixed arguments in a vararg call are passed in GPRs
5808 // too, so use the vararg CC to force them to integer registers.
5811 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
5812 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
5813 assert(!Res && "Call operand has unhandled type");
5817 // At this point, Outs[].VT may already be promoted to i32. To correctly
5818 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5819 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5820 // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
5821 // we use a special version of AnalyzeCallOperands to pass in ValVT and
5823 unsigned NumArgs = Outs.size();
5824 for (unsigned i = 0; i != NumArgs; ++i) {
5825 MVT ValVT = Outs[i].VT;
5826 // Get type of the original argument.
5827 EVT ActualVT = getValueType(DAG.getDataLayout(),
5828 CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
5829 /*AllowUnknown*/ true);
5830 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
5831 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5832 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
5833 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5835 else if (ActualMVT == MVT::i16)
5838 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
5839 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
5840 assert(!Res && "Call operand has unhandled type");
5845 // Get a count of how many bytes are to be pushed on the stack.
5846 unsigned NumBytes = CCInfo.getNextStackOffset();
5849 // Since we're not changing the ABI to make this a tail call, the memory
5850 // operands are already available in the caller's incoming argument space.
5854 // FPDiff is the byte offset of the call's argument area from the callee's.
5855 // Stores to callee stack arguments will be placed in FixedStackSlots offset
5856 // by this amount for a tail call. In a sibling call it must be 0 because the
5857 // caller will deallocate the entire stack and the callee still expects its
5858 // arguments to begin at SP+0. Completely unused for non-tail calls.
5861 if (IsTailCall && !IsSibCall) {
5862 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
5864 // Since callee will pop argument stack as a tail call, we must keep the
5865 // popped size 16-byte aligned.
5866 NumBytes = alignTo(NumBytes, 16);
5868 // FPDiff will be negative if this tail call requires more space than we
5869 // would automatically have in our incoming argument space. Positive if we
5870 // can actually shrink the stack.
5871 FPDiff = NumReusableBytes - NumBytes;
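// Worked example (illustrative): with 32 reusable bytes and 48 bytes of
// outgoing arguments, FPDiff is -16 and 16 bytes of extra tail-call stack
// are recorded via setTailCallReservedStack below.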
5873 // Update the required reserved area if this is the tail call requiring the
5874 // most argument stack space.
5875 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
5876 FuncInfo->setTailCallReservedStack(-FPDiff);
5878 // The stack pointer must be 16-byte aligned at all times it's used for a
5879 // memory operation, which in practice means at *all* times and in
5880 // particular across call boundaries. Therefore our own arguments started at
5881 // a 16-byte aligned SP and the delta applied for the tail call should
5882 // satisfy the same constraint.
5883 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
5886 // Adjust the stack pointer for the new arguments...
5887 // These operations are automatically eliminated by the prolog/epilog pass
5889 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
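// For a tail call no fresh stack is reserved here; the outgoing arguments
// reuse the caller's incoming argument area, adjusted by FPDiff.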
5891 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
5892 getPointerTy(DAG.getDataLayout()));
5894 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
5895 SmallSet<unsigned, 8> RegsUsed;
5896 SmallVector<SDValue, 8> MemOpChains;
5897 auto PtrVT = getPointerTy(DAG.getDataLayout());
5899 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
5900 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
5901 for (const auto &F : Forwards) {
5902 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
5903 RegsToPass.emplace_back(F.PReg, Val);
5907 // Walk the register/memloc assignments, inserting copies/loads.
5908 unsigned ExtraArgLocs = 0;
5909 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
5910 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
5911 SDValue Arg = OutVals[i];
5912 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5914 // Promote the value if needed.
5915 switch (VA.getLocInfo()) {
5917 llvm_unreachable("Unknown loc info!");
5918 case CCValAssign::Full:
5920 case CCValAssign::SExt:
5921 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
5923 case CCValAssign::ZExt:
5924 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
5926 case CCValAssign::AExt:
5927 if (Outs[i].ArgVT == MVT::i1) {
5928 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
5929 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
5930 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
5932 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5934 case CCValAssign::AExtUpper:
5935 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
5936 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5937 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
5938 DAG.getConstant(32, DL, VA.getLocVT()));
5940 case CCValAssign::BCvt:
5941 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
5943 case CCValAssign::Trunc:
5944 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5946 case CCValAssign::FPExt:
5947 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
5949 case CCValAssign::Indirect:
5950 assert(VA.getValVT().isScalableVector() &&
5951 "Only scalable vectors can be passed indirectly");
5953 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
5954 uint64_t PartSize = StoreSize;
5955 unsigned NumParts = 1;
5956 if (Outs[i].Flags.isInConsecutiveRegs()) {
5957 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
5958 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5960 StoreSize *= NumParts;
5963 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
5964 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
5965 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
5966 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
5967 MFI.setStackID(FI, TargetStackID::ScalableVector);
5969 MachinePointerInfo MPI =
5970 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
5971 SDValue Ptr = DAG.getFrameIndex(
5972 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
5973 SDValue SpillSlot = Ptr;
5975 // Ensure we generate all stores for each tuple part, whilst updating the
5976 // pointer after each store correctly using vscale.
5978 Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
5981 SDValue BytesIncrement = DAG.getVScale(
5982 DL, Ptr.getValueType(),
5983 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
5985 Flags.setNoUnsignedWrap(true);
5987 MPI = MachinePointerInfo(MPI.getAddrSpace());
5988 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5989 BytesIncrement, Flags);
5999 if (VA.isRegLoc()) {
6000 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
6001 Outs[0].VT == MVT::i64) {
6002 assert(VA.getLocVT() == MVT::i64 &&
6003 "unexpected calling convention register assignment");
6004 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
6005 "unexpected use of 'returned'");
6006 IsThisReturn = true;
6008 if (RegsUsed.count(VA.getLocReg())) {
6009 // If this register has already been used then we're trying to pack
6010 // parts of an [N x i32] into an X-register. The extension type will
6011 // take care of putting the two halves in the right place but we have to
6014 llvm::find_if(RegsToPass,
6015 [=](const std::pair<unsigned, SDValue> &Elt) {
6016 return Elt.first == VA.getLocReg();
6019 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
6020 // Call site info is used for a function's parameter entry-value
6021 // tracking. For now we track only the simple case where a parameter
6022 // is transferred through a whole register.
6023 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
6024 return ArgReg.Reg == VA.getLocReg();
6027 RegsToPass.emplace_back(VA.getLocReg(), Arg);
6028 RegsUsed.insert(VA.getLocReg());
6029 const TargetOptions &Options = DAG.getTarget().Options;
6030 if (Options.EmitCallSiteInfo)
6031 CSInfo.emplace_back(VA.getLocReg(), i);
6034 assert(VA.isMemLoc());
6037 MachinePointerInfo DstInfo;
6039 // FIXME: This works on big-endian for composite byvals, which are the
6040 // common case. It should also work for fundamental types.
6041 uint32_t BEAlign = 0;
6043 if (VA.getLocInfo() == CCValAssign::Indirect ||
6044 VA.getLocInfo() == CCValAssign::Trunc)
6045 OpSize = VA.getLocVT().getFixedSizeInBits();
6047 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
6048 : VA.getValVT().getSizeInBits();
6049 OpSize = (OpSize + 7) / 8;
6050 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
6051 !Flags.isInConsecutiveRegs()) {
6053 BEAlign = 8 - OpSize;
6055 unsigned LocMemOffset = VA.getLocMemOffset();
6056 int32_t Offset = LocMemOffset + BEAlign;
6057 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
6058 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
6061 Offset = Offset + FPDiff;
6062 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
6064 DstAddr = DAG.getFrameIndex(FI, PtrVT);
6066 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
6068 // Make sure any stack arguments overlapping with where we're storing
6069 // are loaded before this eventual operation. Otherwise they'll be
6071 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
6073 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
6075 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
6076 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
6080 if (Outs[i].Flags.isByVal()) {
6082 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
6083 SDValue Cpy = DAG.getMemcpy(
6084 Chain, DL, DstAddr, Arg, SizeNode,
6085 Outs[i].Flags.getNonZeroByValAlign(),
6086 /*isVol = */ false, /*AlwaysInline = */ false,
6087 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
6089 MemOpChains.push_back(Cpy);
6091 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
6092 // promoted to a legal register type i32, we should truncate Arg back to
6094 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
6095 VA.getValVT() == MVT::i16)
6096 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
6098 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
6099 MemOpChains.push_back(Store);
6104 if (!MemOpChains.empty())
6105 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
6107 // Build a sequence of copy-to-reg nodes chained together with token chain
6108 // and flag operands which copy the outgoing args into the appropriate regs.
6110 for (auto &RegToPass : RegsToPass) {
6111 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
6112 RegToPass.second, InFlag);
6113 InFlag = Chain.getValue(1);
6116 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
6117 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
6118 // node so that legalize doesn't hack it.
6119 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6120 auto GV = G->getGlobal();
6122 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
6123 if (OpFlags & AArch64II::MO_GOT) {
6124 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
6125 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6127 const GlobalValue *GV = G->getGlobal();
6128 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
6130 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
6131 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6132 Subtarget->isTargetMachO()) {
6133 const char *Sym = S->getSymbol();
6134 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
6135 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6137 const char *Sym = S->getSymbol();
6138 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
6142 // We don't usually want to end the call-sequence here because we would tidy
6143 // the frame up *after* the call; however, in the ABI-changing tail-call case
6144 // we've carefully laid out the parameters so that when sp is reset they'll be
6145 // in the correct location.
6146 if (IsTailCall && !IsSibCall) {
6147 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
6148 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
6149 InFlag = Chain.getValue(1);
6152 std::vector<SDValue> Ops;
6153 Ops.push_back(Chain);
6154 Ops.push_back(Callee);
6157 // Each tail call may have to adjust the stack by a different amount, so
6158 // this information must travel along with the operation for eventual
6159 // consumption by emitEpilogue.
6160 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
6163 // Add argument registers to the end of the list so that they are known live
6165 for (auto &RegToPass : RegsToPass)
6166 Ops.push_back(DAG.getRegister(RegToPass.first,
6167 RegToPass.second.getValueType()));
6169 // Add a register mask operand representing the call-preserved registers.
6170 const uint32_t *Mask;
6171 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6173 // For 'this' returns, use the X0-preserving mask if applicable
6174 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
6176 IsThisReturn = false;
6177 Mask = TRI->getCallPreservedMask(MF, CallConv);
6180 Mask = TRI->getCallPreservedMask(MF, CallConv);
6182 if (Subtarget->hasCustomCallingConv())
6183 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
6185 if (TRI->isAnyArgRegReserved(MF))
6186 TRI->emitReservedArgRegCallError(MF);
6188 assert(Mask && "Missing call preserved mask for calling convention");
6189 Ops.push_back(DAG.getRegisterMask(Mask));
6191 if (InFlag.getNode())
6192 Ops.push_back(InFlag);
6194 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6196 // If we're doing a tail call, use a TC_RETURN here rather than an
6197 // actual call instruction.
6199 MF.getFrameInfo().setHasTailCall();
6200 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
6201 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
6205 unsigned CallOpc = AArch64ISD::CALL;
6206 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
6207 // be expanded to the call, directly followed by a special marker sequence.
6208 // Use the CALL_RVMARKER to do that.
6209 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
6210 assert(!IsTailCall &&
6211 "tail calls cannot be marked with clang.arc.attachedcall");
6212 CallOpc = AArch64ISD::CALL_RVMARKER;
6215 // Returns a chain and a flag for retval copy to use.
6216 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
6217 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
6218 InFlag = Chain.getValue(1);
6219 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
6221 uint64_t CalleePopBytes =
6222 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
6224 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
6225 DAG.getIntPtrConstant(CalleePopBytes, DL, true),
6228 InFlag = Chain.getValue(1);
6230 // Handle result values, copying them out of physregs into vregs that we
6231 // return.
6232 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
6233 InVals, IsThisReturn,
6234 IsThisReturn ? OutVals[0] : SDValue());
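// CanLowerReturn - check whether every value in Outs can be assigned to a
// return register under CallConv; when this returns false the caller of the
// hook demotes the return value to an sret argument instead.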
6237 bool AArch64TargetLowering::CanLowerReturn(
6238 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
6239 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
6240 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6241 SmallVector<CCValAssign, 16> RVLocs;
6242 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
6243 return CCInfo.CheckReturn(Outs, RetCC);
6247 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
6248 bool isVarArg,
6249 const SmallVectorImpl<ISD::OutputArg> &Outs,
6250 const SmallVectorImpl<SDValue> &OutVals,
6251 const SDLoc &DL, SelectionDAG &DAG) const {
6252 auto &MF = DAG.getMachineFunction();
6253 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6255 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6256 SmallVector<CCValAssign, 16> RVLocs;
6257 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
6259 CCInfo.AnalyzeReturn(Outs, RetCC);
6261 // Copy the result values into the output registers.
6263 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
6264 SmallSet<unsigned, 4> RegsUsed;
6265 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
6266 ++i, ++realRVLocIdx) {
6267 CCValAssign &VA = RVLocs[i];
6268 assert(VA.isRegLoc() && "Can only return in registers!");
6269 SDValue Arg = OutVals[realRVLocIdx];
6271 switch (VA.getLocInfo()) {
6273 llvm_unreachable("Unknown loc info!");
6274 case CCValAssign::Full:
6275 if (Outs[i].ArgVT == MVT::i1) {
6276 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
6277 // value. This is strictly redundant on Darwin (which uses "zeroext
6278 // i1"), but will be optimised out before ISel.
6279 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
6280 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
6283 case CCValAssign::BCvt:
6284 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
6286 case CCValAssign::AExt:
6287 case CCValAssign::ZExt:
6288 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
6290 case CCValAssign::AExtUpper:
6291 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
6292 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
6293 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
6294 DAG.getConstant(32, DL, VA.getLocVT()));
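// If the convention assigned more than one value (e.g. the upper and lower
// halves produced by the AExtUpper case above) to the same physical register,
// OR the new bits into the value already bound to that register.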
6298 if (RegsUsed.count(VA.getLocReg())) {
6300 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
6301 return Elt.first == VA.getLocReg();
6303 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
6305 RetVals.emplace_back(VA.getLocReg(), Arg);
6306 RegsUsed.insert(VA.getLocReg());
6310 SmallVector<SDValue, 4> RetOps(1, Chain);
6311 for (auto &RetVal : RetVals) {
6312 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
6313 Flag = Chain.getValue(1);
6315 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
6318 // Windows AArch64 ABIs require that for returning structs by value we copy
6319 // the sret argument into X0 for the return.
6320 // We saved the argument into a virtual register in the entry block,
6321 // so now we copy the value out and into X0.
6322 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
6323 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
6324 getPointerTy(MF.getDataLayout()));
6326 unsigned RetValReg = AArch64::X0;
6327 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
6328 Flag = Chain.getValue(1);
6331 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
6334 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6335 const MCPhysReg *I =
6336 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
6339 if (AArch64::GPR64RegClass.contains(*I))
6340 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
6341 else if (AArch64::FPR64RegClass.contains(*I))
6342 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
6344 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
6348 RetOps[0] = Chain; // Update chain.
6350 // Add the flag if we have it.
6352 RetOps.push_back(Flag);
6354 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
6357 //===----------------------------------------------------------------------===//
6358 // Other Lowering Code
6359 //===----------------------------------------------------------------------===//
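// The getTargetNode overloads below produce the target-specific form of each
// addressable node (global address, jump table, constant pool, block address)
// so that the address-materialisation helpers further down (getGOT,
// getAddrLarge, getAddr, getAddrTiny) can be written once as templates.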
6361 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
6363 unsigned Flag) const {
6364 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
6365 N->getOffset(), Flag);
6368 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
6370 unsigned Flag) const {
6371 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
6374 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
6376 unsigned Flag) const {
6377 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
6378 N->getOffset(), Flag);
6381 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
6383 unsigned Flag) const {
6384 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
6388 template <class NodeTy>
6389 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
6390 unsigned Flags) const {
6391 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
6393 EVT Ty = getPointerTy(DAG.getDataLayout());
6394 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
6395 // FIXME: Once remat is capable of dealing with instructions with register
6396 // operands, expand this into two nodes instead of using a wrapper node.
6397 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
6400 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
6401 template <class NodeTy>
6402 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
6403 unsigned Flags) const {
6404 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
6406 EVT Ty = getPointerTy(DAG.getDataLayout());
6407 const unsigned char MO_NC = AArch64II::MO_NC;
6409 AArch64ISD::WrapperLarge, DL, Ty,
6410 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
6411 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
6412 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
6413 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
6416 // (addlow (adrp %hi(sym)) %lo(sym))
6417 template <class NodeTy>
6418 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
6419 unsigned Flags) const {
6420 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
6422 EVT Ty = getPointerTy(DAG.getDataLayout());
6423 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
6424 SDValue Lo = getTargetNode(N, Ty, DAG,
6425 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
6426 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
6427 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
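// (adr sym)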
6431 template <class NodeTy>
6432 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
6433 unsigned Flags) const {
6434 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
6436 EVT Ty = getPointerTy(DAG.getDataLayout());
6437 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
6438 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
6441 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
6442 SelectionDAG &DAG) const {
6443 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
6444 const GlobalValue *GV = GN->getGlobal();
6445 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
6447 if (OpFlags != AArch64II::MO_NO_FLAG)
6448 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
6449 "unexpected offset in global node");
6451 // This also catches the large code model case for Darwin, and tiny code
6452 // model with got relocations.
6453 if ((OpFlags & AArch64II::MO_GOT) != 0) {
6454 return getGOT(GN, DAG, OpFlags);
6458 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
6459 Result = getAddrLarge(GN, DAG, OpFlags);
6460 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6461 Result = getAddrTiny(GN, DAG, OpFlags);
6463 Result = getAddr(GN, DAG, OpFlags);
6465 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6467 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
6468 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
6469 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
6473 /// Convert a TLS address reference into the correct sequence of loads
6474 /// and calls to compute the variable's address (for Darwin, currently) and
6475 /// return an SDValue containing the final node.
6477 /// Darwin only has one TLS scheme which must be capable of dealing with the
6478 /// fully general situation, in the worst case. This means:
6479 /// + "extern __thread" declaration.
6480 /// + Defined in a possibly unknown dynamic library.
6482 /// The general system is that each __thread variable has a [3 x i64] descriptor
6483 /// which contains information used by the runtime to calculate the address. The
6484 /// only part of this the compiler needs to know about is the first xword, which
6485 /// contains a function pointer that must be called with the address of the
6486 /// entire descriptor in "x0".
6488 /// Since this descriptor may be in a different unit, in general even the
6489 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
6491 /// adrp x0, _var@TLVPPAGE
6492 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
6493 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
6494 /// ; the function pointer
6495 /// blr x1 ; Uses descriptor address in x0
6496 /// ; Address of _var is now in x0.
6498 /// If the address of _var's descriptor *is* known to the linker, then it can
6499 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
6500 /// a slight efficiency gain.
6502 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
6503 SelectionDAG &DAG) const {
6504 assert(Subtarget->isTargetDarwin() &&
6505 "This function expects a Darwin target");
6508 MVT PtrVT = getPointerTy(DAG.getDataLayout());
6509 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
6510 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
6512 SDValue TLVPAddr =
6513 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
6514 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
6516 // The first entry in the descriptor is a function pointer that we must call
6517 // to obtain the address of the variable.
6518 SDValue Chain = DAG.getEntryNode();
6519 SDValue FuncTLVGet = DAG.getLoad(
6520 PtrMemVT, DL, Chain, DescAddr,
6521 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
6522 Align(PtrMemVT.getSizeInBits() / 8),
6523 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
6524 Chain = FuncTLVGet.getValue(1);
6526 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
6527 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
6529 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6530 MFI.setAdjustsStack(true);
6532 // TLS calls preserve all registers except those that absolutely must be
6533 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
6534 // silly).
6535 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6536 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
6537 if (Subtarget->hasCustomCallingConv())
6538 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
6540 // Finally, we can make the call. This is just a degenerate version of a
6541 // normal AArch64 call node: x0 takes the address of the descriptor, and
6542 // returns the address of the variable in this thread.
6543 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
6544 Chain =
6545 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6546 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
6547 DAG.getRegisterMask(Mask), Chain.getValue(1));
6548 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
6551 /// Convert a thread-local variable reference into a sequence of instructions to
6552 /// compute the variable's address for the local exec TLS model of ELF targets.
6553 /// The sequence depends on the maximum TLS area size.
6554 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
6557 SelectionDAG &DAG) const {
6558 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6559 SDValue TPOff, Addr;
6561 switch (DAG.getTarget().Options.TLSSize) {
6563 llvm_unreachable("Unexpected TLS size");
6566 // mrs x0, TPIDR_EL0
6567 // add x0, x0, :tprel_lo12:a
6568 SDValue Var = DAG.getTargetGlobalAddress(
6569 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
6570 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6572 DAG.getTargetConstant(0, DL, MVT::i32)),
6577 // mrs x0, TPIDR_EL0
6578 // add x0, x0, :tprel_hi12:a
6579 // add x0, x0, :tprel_lo12_nc:a
6580 SDValue HiVar = DAG.getTargetGlobalAddress(
6581 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6582 SDValue LoVar = DAG.getTargetGlobalAddress(
6584 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6585 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6587 DAG.getTargetConstant(0, DL, MVT::i32)),
6589 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
6591 DAG.getTargetConstant(0, DL, MVT::i32)),
6596 // mrs x1, TPIDR_EL0
6597 // movz x0, #:tprel_g1:a
6598 // movk x0, #:tprel_g0_nc:a
6600 SDValue HiVar = DAG.getTargetGlobalAddress(
6601 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
6602 SDValue LoVar = DAG.getTargetGlobalAddress(
6604 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
6605 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6606 DAG.getTargetConstant(16, DL, MVT::i32)),
6608 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6609 DAG.getTargetConstant(0, DL, MVT::i32)),
6611 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6615 // mrs x1, TPIDR_EL0
6616 // movz x0, #:tprel_g2:a
6617 // movk x0, #:tprel_g1_nc:a
6618 // movk x0, #:tprel_g0_nc:a
6620 SDValue HiVar = DAG.getTargetGlobalAddress(
6621 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
6622 SDValue MiVar = DAG.getTargetGlobalAddress(
6624 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
6625 SDValue LoVar = DAG.getTargetGlobalAddress(
6627 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
6628 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6629 DAG.getTargetConstant(32, DL, MVT::i32)),
6631 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
6632 DAG.getTargetConstant(16, DL, MVT::i32)),
6634 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6635 DAG.getTargetConstant(0, DL, MVT::i32)),
6637 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6642 /// When accessing thread-local variables under either the general-dynamic or
6643 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
6644 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
6645 /// is a function pointer to carry out the resolution.
6647 /// The sequence is:
6648 /// adrp x0, :tlsdesc:var
6649 /// ldr x1, [x0, #:tlsdesc_lo12:var]
6650 /// add x0, x0, #:tlsdesc_lo12:var
6651 /// .tlsdesccall var
6653 /// (TPIDR_EL0 offset now in x0)
6655 /// The above sequence must be produced unscheduled, to enable the linker to
6656 /// optimize/relax this sequence.
6657 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
6658 /// above sequence, and expanded really late in the compilation flow, to ensure
6659 /// the sequence is produced as per above.
6660 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
6661 const SDLoc &DL,
6662 SelectionDAG &DAG) const {
6663 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6665 SDValue Chain = DAG.getEntryNode();
6666 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6669 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
6670 SDValue Glue = Chain.getValue(1);
6672 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
6676 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
6677 SelectionDAG &DAG) const {
6678 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
6680 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6682 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
6684 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
6685 if (Model == TLSModel::LocalDynamic)
6686 Model = TLSModel::GeneralDynamic;
6689 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6690 Model != TLSModel::LocalExec)
6691 report_fatal_error("ELF TLS only supported in small memory model or "
6692 "in local exec TLS model");
6693 // Different choices can be made for the maximum size of the TLS area for a
6694 // module. For the small address model, the default TLS size is 16MiB and the
6695 // maximum TLS size is 4GiB.
6696 // FIXME: add tiny and large code model support for TLS access models other
6697 // than local exec. We currently generate the same code as small for tiny,
6698 // which may be larger than needed.
6701 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6703 const GlobalValue *GV = GA->getGlobal();
6705 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6707 if (Model == TLSModel::LocalExec) {
6708 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
6709 } else if (Model == TLSModel::InitialExec) {
6710 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
6711 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
6712 } else if (Model == TLSModel::LocalDynamic) {
6713 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
6714 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
6715 // the beginning of the module's TLS region, followed by a DTPREL offset
6716 // calculation.
6718 // These accesses will need deduplicating if there's more than one.
6719 AArch64FunctionInfo *MFI =
6720 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
6721 MFI->incNumLocalDynamicTLSAccesses();
6723 // The call needs a relocation too for linker relaxation. It doesn't make
6724 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6725 // the symbol.
6726 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
6727 AArch64II::MO_TLS);
6729 // Now we can calculate the offset from TPIDR_EL0 to this module's
6730 // thread-local area.
6731 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6733 // Now use :dtprel_whatever: operations to calculate this variable's offset
6734 // in its thread-storage area.
6735 SDValue HiVar = DAG.getTargetGlobalAddress(
6736 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6737 SDValue LoVar = DAG.getTargetGlobalAddress(
6738 GV, DL, MVT::i64, 0,
6739 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6741 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
6742 DAG.getTargetConstant(0, DL, MVT::i32)),
6744 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
6745 DAG.getTargetConstant(0, DL, MVT::i32)),
6747 } else if (Model == TLSModel::GeneralDynamic) {
6748 // The call needs a relocation too for linker relaxation. It doesn't make
6749 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6750 // the symbol.
6751 SDValue SymAddr =
6752 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
6754 // Finally we can make a call to calculate the offset from tpidr_el0.
6755 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6757 llvm_unreachable("Unsupported ELF TLS access model");
6759 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6763 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
6764 SelectionDAG &DAG) const {
6765 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
6767 SDValue Chain = DAG.getEntryNode();
6768 EVT PtrVT = getPointerTy(DAG.getDataLayout());
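// On Windows, the platform register x18 always points at the TEB.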
6771 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
6773 // Load the ThreadLocalStoragePointer from the TEB
6774 // A pointer to the TLS array is located at offset 0x58 from the TEB.
6776 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
6777 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
6778 Chain = TLSArray.getValue(1);
6780 // Load the TLS index from the C runtime;
6781 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
6782 // This also does the same as LOADgot, but using a generic i32 load,
6783 // while LOADgot only loads i64.
6784 SDValue TLSIndexHi =
6785 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
6786 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
6787 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6788 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
6790 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
6791 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
6792 Chain = TLSIndex.getValue(1);
6794 // The pointer to the thread's TLS data area is stored at an offset of
6795 // TLSIndex * 8 bytes into the TLSArray.
6796 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
6797 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
6798 DAG.getConstant(3, DL, PtrVT));
6799 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
6800 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
6801 MachinePointerInfo());
6802 Chain = TLS.getValue(1);
6804 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6805 const GlobalValue *GV = GA->getGlobal();
6806 SDValue TGAHi = DAG.getTargetGlobalAddress(
6807 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6808 SDValue TGALo = DAG.getTargetGlobalAddress(
6810 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6812 // Add the offset from the start of the .tls section (section base).
6814 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
6815 DAG.getTargetConstant(0, DL, MVT::i32)),
6817 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
6821 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
6822 SelectionDAG &DAG) const {
6823 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6824 if (DAG.getTarget().useEmulatedTLS())
6825 return LowerToTLSEmulatedModel(GA, DAG);
6827 if (Subtarget->isTargetDarwin())
6828 return LowerDarwinGlobalTLSAddress(Op, DAG);
6829 if (Subtarget->isTargetELF())
6830 return LowerELFGlobalTLSAddress(Op, DAG);
6831 if (Subtarget->isTargetWindows())
6832 return LowerWindowsGlobalTLSAddress(Op, DAG);
6834 llvm_unreachable("Unexpected platform trying to use TLS");
6837 // Looks through \param Val to determine the bit that can be used to
6838 // check the sign of the value. It returns the unextended value and
6839 // the sign bit position.
6840 std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
6841 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
6842 return {Val.getOperand(0),
6843 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
6846 if (Val.getOpcode() == ISD::SIGN_EXTEND)
6847 return {Val.getOperand(0),
6848 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
6850 return {Val, Val.getValueSizeInBits() - 1};
6853 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
6854 SDValue Chain = Op.getOperand(0);
6855 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
6856 SDValue LHS = Op.getOperand(2);
6857 SDValue RHS = Op.getOperand(3);
6858 SDValue Dest = Op.getOperand(4);
6861 MachineFunction &MF = DAG.getMachineFunction();
6862 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
6863 // will not be produced, as they are conditional branch instructions that do
6864 // not set flags.
6865 bool ProduceNonFlagSettingCondBr =
6866 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
6868 // Handle f128 first, since lowering it will result in comparing the return
6869 // value of a libcall against zero, which is just what the rest of LowerBR_CC
6870 // is expecting to deal with.
6871 if (LHS.getValueType() == MVT::f128) {
6872 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
6874 // If softenSetCCOperands returned a scalar, we need to compare the result
6875 // against zero to select between true and false values.
6876 if (!RHS.getNode()) {
6877 RHS = DAG.getConstant(0, dl, LHS.getValueType());
6882 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
6884 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
6885 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6886 // Only lower legal XALUO ops.
6887 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
6890 // The actual operation with overflow check.
6891 AArch64CC::CondCode OFCC;
6892 SDValue Value, Overflow;
6893 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
6895 if (CC == ISD::SETNE)
6896 OFCC = getInvertedCondCode(OFCC);
6897 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
6899 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6903 if (LHS.getValueType().isInteger()) {
6904 assert((LHS.getValueType() == RHS.getValueType()) &&
6905 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
6907 // If the RHS of the comparison is zero, we can potentially fold this
6908 // to a specialized branch.
6909 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
6910 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
6911 if (CC == ISD::SETEQ) {
6912 // See if we can use a TBZ to fold in an AND as well.
6913 // TBZ has a smaller branch displacement than CBZ. If the offset is
6914 // out of bounds, a late MI-layer pass rewrites branches.
6915 // 403.gcc is an example that hits this case.
6916 if (LHS.getOpcode() == ISD::AND &&
6917 isa<ConstantSDNode>(LHS.getOperand(1)) &&
6918 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6919 SDValue Test = LHS.getOperand(0);
6920 uint64_t Mask = LHS.getConstantOperandVal(1);
6921 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
6922 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6926 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
6927 } else if (CC == ISD::SETNE) {
6928 // See if we can use a TBZ to fold in an AND as well.
6929 // TBZ has a smaller branch displacement than CBZ. If the offset is
6930 // out of bounds, a late MI-layer pass rewrites branches.
6931 // 403.gcc is an example that hits this case.
6932 if (LHS.getOpcode() == ISD::AND &&
6933 isa<ConstantSDNode>(LHS.getOperand(1)) &&
6934 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6935 SDValue Test = LHS.getOperand(0);
6936 uint64_t Mask = LHS.getConstantOperandVal(1);
6937 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
6938 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6942 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
6943 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
6944 // Don't combine AND since emitComparison converts the AND to an ANDS
6945 // (a.k.a. TST) and the test in the test bit and branch instruction
6946 // becomes redundant. This would also increase register pressure.
6947 uint64_t SignBitPos;
6948 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6949 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
6950 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6953 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
6954 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
6955 // Don't combine AND since emitComparison converts the AND to an ANDS
6956 // (a.k.a. TST) and the test in the test bit and branch instruction
6957 // becomes redundant. This would also increase register pressure.
6958 uint64_t SignBitPos;
6959 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6960 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
6961 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6965 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
6966 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6970 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
6971 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
6973 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
6974 // clean. Some of them require two branches to implement.
6975 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6976 AArch64CC::CondCode CC1, CC2;
6977 changeFPCCToAArch64CC(CC, CC1, CC2);
6978 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
6980 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
6981 if (CC2 != AArch64CC::AL) {
6982 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
6983 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
6990 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
6991 SelectionDAG &DAG) const {
6992 EVT VT = Op.getValueType();
6995 SDValue In1 = Op.getOperand(0);
6996 SDValue In2 = Op.getOperand(1);
6997 EVT SrcVT = In2.getValueType();
6999 if (SrcVT.bitsLT(VT))
7000 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
7001 else if (SrcVT.bitsGT(VT))
7002 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
7006 SDValue VecVal1, VecVal2;
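// Helper to move the scalar operands into vector registers (by inserting them
// into an undef vector) so the sign bit can be selected with a vector BIT
// instruction below; vector operands only need a bitcast to the integer type.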
7008 auto setVecVal = [&] (int Idx) {
7009 if (!VT.isVector()) {
7010 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
7011 DAG.getUNDEF(VecVT), In1);
7012 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
7013 DAG.getUNDEF(VecVT), In2);
7015 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
7016 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
7020 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
7021 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
7022 EltMask = 0x80000000ULL;
7023 setVecVal(AArch64::ssub);
7024 } else if (VT == MVT::f64 || VT == MVT::v2f64) {
7027 // We want to materialize a mask with the high bit set, but the AdvSIMD
7028 // immediate moves cannot materialize that in a single instruction for
7029 // 64-bit elements. Instead, materialize zero and then negate it.
7032 setVecVal(AArch64::dsub);
7033 } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
7034 VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
7035 EltMask = 0x8000ULL;
7036 setVecVal(AArch64::hsub);
7038 llvm_unreachable("Invalid type for copysign!");
7041 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
7043 // If we couldn't materialize the mask above, then the mask vector will be
7044 // the zero vector, and we need to negate it here.
7045 if (VT == MVT::f64 || VT == MVT::v2f64) {
7046 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
7047 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
7048 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
7052 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
7055 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
7057 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
7058 else if (VT == MVT::f64)
7059 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
7061 return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
7064 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
7065 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
7066 Attribute::NoImplicitFloat))
7069 if (!Subtarget->hasNEON())
7072 // Although there is no dedicated integer popcount instruction, CTPOP can be
7073 // lowered more efficiently to the following sequence that uses AdvSIMD
7074 // registers/instructions, as long as the copies to/from the AdvSIMD
7075 // registers are cheap.
7076 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
7077 // CNT V0.8B, V0.8B // 8xbyte pop-counts
7078 // ADDV B0, V0.8B // sum 8xbyte pop-counts
7079 // UMOV X0, V0.B[0] // copy byte result back to integer reg
7080 SDValue Val = Op.getOperand(0);
7082 EVT VT = Op.getValueType();
7084 if (VT == MVT::i32 || VT == MVT::i64) {
7086 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
7087 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
7089 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
7090 SDValue UaddLV = DAG.getNode(
7091 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
7092 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
7095 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
7097 } else if (VT == MVT::i128) {
7098 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
7100 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
7101 SDValue UaddLV = DAG.getNode(
7102 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
7103 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
7105 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
7108 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
7109 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
7111 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
7112 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
7113 "Unexpected type for custom ctpop lowering");
7115 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
7116 Val = DAG.getBitcast(VT8Bit, Val);
7117 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
7119 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
7120 unsigned EltSize = 8;
7121 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
7122 while (EltSize != VT.getScalarSizeInBits()) {
7125 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
7127 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
7128 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
7134 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
7135 EVT VT = Op.getValueType();
7136 assert(VT.isScalableVector() ||
7137 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true));
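// Lower CTTZ as CTLZ of the bit-reversed input: cttz(x) == ctlz(rbit(x)).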
7140 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
7141 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
7144 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
7145 SelectionDAG &DAG) const {
7146 EVT VT = Op.getValueType();
7148 if (VT.isScalableVector() ||
7149 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7150 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
7157 switch (VT.getSimpleVT().SimpleTy) {
7159 llvm_unreachable("Invalid type for bitreverse!");
7163 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
7170 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
7177 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
7184 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
7190 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
7191 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
7194 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
7196 if (Op.getValueType().isVector())
7197 return LowerVSETCC(Op, DAG);
7199 bool IsStrict = Op->isStrictFPOpcode();
7200 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
7201 unsigned OpNo = IsStrict ? 1 : 0;
7204 Chain = Op.getOperand(0);
7205 SDValue LHS = Op.getOperand(OpNo + 0);
7206 SDValue RHS = Op.getOperand(OpNo + 1);
7207 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
7210 // We chose ZeroOrOneBooleanContents, so use zero and one.
7211 EVT VT = Op.getValueType();
7212 SDValue TVal = DAG.getConstant(1, dl, VT);
7213 SDValue FVal = DAG.getConstant(0, dl, VT);
7215 // Handle f128 first, since one possible outcome is a normal integer
7216 // comparison which gets picked up by the next if statement.
7217 if (LHS.getValueType() == MVT::f128) {
7218 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
7221 // If softenSetCCOperands returned a scalar, use it.
7222 if (!RHS.getNode()) {
7223 assert(LHS.getValueType() == Op.getValueType() &&
7224 "Unexpected setcc expansion!");
7225 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
7229 if (LHS.getValueType().isInteger()) {
7231 SDValue Cmp = getAArch64Cmp(
7232 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
7234 // Note that we inverted the condition above, so we reverse the order of
7235 // the true and false operands here. This will allow the setcc to be
7236 // matched to a single CSINC instruction.
7237 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
7238 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
7241 // Now we know we're dealing with FP values.
7242 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
7243 LHS.getValueType() == MVT::f64);
7245 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
7246 // and do the comparison.
7249 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
7251 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
7253 AArch64CC::CondCode CC1, CC2;
7254 changeFPCCToAArch64CC(CC, CC1, CC2);
7256 if (CC2 == AArch64CC::AL) {
7257 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
7259 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
7261 // Note that we inverted the condition above, so we reverse the order of
7262 // the true and false operands here. This will allow the setcc to be
7263 // matched to a single CSINC instruction.
7264 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
7266 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
7267 // totally clean. Some of them require two CSELs to implement. As in
7268 // this case, we emit the first CSEL and then emit a second using the output
7269 // of the first as the RHS. We're effectively OR'ing the two CC's together.
7271 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
7272 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
7274 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
7276 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
7277 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
7279 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
7282 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
7283 SDValue RHS, SDValue TVal,
7284 SDValue FVal, const SDLoc &dl,
7285 SelectionDAG &DAG) const {
7286 // Handle f128 first, because it will result in a comparison of some RTLIB
7287 // call result against zero.
7288 if (LHS.getValueType() == MVT::f128) {
7289 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
7291 // If softenSetCCOperands returned a scalar, we need to compare the result
7292 // against zero to select between true and false values.
7293 if (!RHS.getNode()) {
7294 RHS = DAG.getConstant(0, dl, LHS.getValueType());
7299 // Also handle f16, for which we need to do a f32 comparison.
7300 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
7301 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
7302 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
7305 // Next, handle integers.
7306 if (LHS.getValueType().isInteger()) {
7307 assert((LHS.getValueType() == RHS.getValueType()) &&
7308 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
7310 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
7311 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
7312 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
7313 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
7314 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions.
7316 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnesValue() && CTVal && CFVal &&
7317 CTVal->isOne() && CFVal->isAllOnesValue() &&
7318 LHS.getValueType() == TVal.getValueType()) {
7319 EVT VT = LHS.getValueType();
7321 DAG.getNode(ISD::SRA, dl, VT, LHS,
7322 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
7323 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
7326 unsigned Opcode = AArch64ISD::CSEL;
7328 // If both the TVal and the FVal are constants, see if we can swap them in
7329 // order to form a CSINV or CSINC out of them.
7330 if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
7331 std::swap(TVal, FVal);
7332 std::swap(CTVal, CFVal);
7333 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7334 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
7335 std::swap(TVal, FVal);
7336 std::swap(CTVal, CFVal);
7337 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7338 } else if (TVal.getOpcode() == ISD::XOR) {
7339 // If TVal is a NOT we want to swap TVal and FVal so that we can match
7340 // with a CSINV rather than a CSEL.
7341 if (isAllOnesConstant(TVal.getOperand(1))) {
7342 std::swap(TVal, FVal);
7343 std::swap(CTVal, CFVal);
7344 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7346 } else if (TVal.getOpcode() == ISD::SUB) {
7347 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
7348 // that we can match with a CSNEG rather than a CSEL.
7349 if (isNullConstant(TVal.getOperand(0))) {
7350 std::swap(TVal, FVal);
7351 std::swap(CTVal, CFVal);
7352 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7354 } else if (CTVal && CFVal) {
7355 const int64_t TrueVal = CTVal->getSExtValue();
7356 const int64_t FalseVal = CFVal->getSExtValue();
7359 // If both TVal and FVal are constants, see if FVal is the
7360 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
7361 // instead of a CSEL in that case.
7362 if (TrueVal == ~FalseVal) {
7363 Opcode = AArch64ISD::CSINV;
7364 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
7365 TrueVal == -FalseVal) {
7366 Opcode = AArch64ISD::CSNEG;
7367 } else if (TVal.getValueType() == MVT::i32) {
7368 // If our operands are only 32-bit wide, make sure we use 32-bit
7369 // arithmetic for the check whether we can use CSINC. This ensures that
7370 // the addition in the check will wrap around properly in case there is
7371 // an overflow (which would not be the case if we do the check with
7372 // 64-bit arithmetic).
7373 const uint32_t TrueVal32 = CTVal->getZExtValue();
7374 const uint32_t FalseVal32 = CFVal->getZExtValue();
7376 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
7377 Opcode = AArch64ISD::CSINC;
7379 if (TrueVal32 > FalseVal32) {
7383 // 64-bit check whether we can use CSINC.
7384 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
7385 Opcode = AArch64ISD::CSINC;
7387 if (TrueVal > FalseVal) {
7392 // Swap TVal and FVal if necessary.
7394 std::swap(TVal, FVal);
7395 std::swap(CTVal, CFVal);
7396 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7399 if (Opcode != AArch64ISD::CSEL) {
7400 // Drop FVal since we can get its value by simply inverting/negating
7401 // TVal.
7406 // Avoid materializing a constant when possible by reusing a known value in
7407 // a register. However, don't perform this optimization if the known value
7408 // is one, zero or negative one in the case of a CSEL. We can always
7409 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
7410 // FVal, respectively.
7411 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
7412 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
7413 !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
7414 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
7415 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
7416 // "a != C ? x : a" to avoid materializing C.
7417 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
7419 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
7421 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
7422 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
7423 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
7424 // avoid materializing C.
7425 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
7426 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
7427 Opcode = AArch64ISD::CSINV;
7429 FVal = DAG.getConstant(0, dl, FVal.getValueType());
7434 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
7435 EVT VT = TVal.getValueType();
7436 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
7439 // Now we know we're dealing with FP values.
7440 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
7441 LHS.getValueType() == MVT::f64);
7442 assert(LHS.getValueType() == RHS.getValueType());
7443 EVT VT = TVal.getValueType();
7444 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
7446 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
7447 // clean. Some of them require two CSELs to implement.
7448 AArch64CC::CondCode CC1, CC2;
7449 changeFPCCToAArch64CC(CC, CC1, CC2);
7451 if (DAG.getTarget().Options.UnsafeFPMath) {
7452 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
7453 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
7454 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
7455 if (RHSVal && RHSVal->isZero()) {
7456 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
7457 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
7459 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
7460 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
7462 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
7463 CFVal && CFVal->isZero() &&
7464 FVal.getValueType() == LHS.getValueType())
7469 // Emit first, and possibly only, CSEL.
7470 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
7471 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
7473 // If we need a second CSEL, emit it, using the output of the first as the
7474 // RHS. We're effectively OR'ing the two CC's together.
7475 if (CC2 != AArch64CC::AL) {
7476 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
7477 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
7480 // Otherwise, return the output of the first CSEL.
7484 SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
7485 SelectionDAG &DAG) const {
7487 EVT Ty = Op.getValueType();
7488 auto Idx = Op.getConstantOperandAPInt(2);
7489 if (Idx.sge(-1) && Idx.slt(Ty.getVectorMinNumElements()))
7494 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
7495 SelectionDAG &DAG) const {
7496 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
7497 SDValue LHS = Op.getOperand(0);
7498 SDValue RHS = Op.getOperand(1);
7499 SDValue TVal = Op.getOperand(2);
7500 SDValue FVal = Op.getOperand(3);
7502 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
7505 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
7506 SelectionDAG &DAG) const {
7507 SDValue CCVal = Op->getOperand(0);
7508 SDValue TVal = Op->getOperand(1);
7509 SDValue FVal = Op->getOperand(2);
7512 EVT Ty = Op.getValueType();
7513 if (Ty.isScalableVector()) {
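// Splat the scalar i1 condition into a predicate vector and let VSELECT do
// the per-lane selection.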
7514 SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
7515 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
7516 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
7517 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
7520 if (useSVEForFixedLengthVectorVT(Ty)) {
7521 // FIXME: Ideally this would be the same as above using i1 types, however
7522 // for the moment we can't deal with fixed i1 vector types properly, so
7523 // instead extend the predicate to a result type sized integer vector.
7524 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
7525 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
7526 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
7527 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
7528 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
7531 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
7533 if (ISD::isOverflowIntrOpRes(CCVal)) {
7534 // Only lower legal XALUO ops.
7535 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
7538 AArch64CC::CondCode OFCC;
7539 SDValue Value, Overflow;
7540 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
7541 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
7543 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
7547 // Lower it the same way as we would lower a SELECT_CC node.
7550 if (CCVal.getOpcode() == ISD::SETCC) {
7551 LHS = CCVal.getOperand(0);
7552 RHS = CCVal.getOperand(1);
7553 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
7556 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
7559 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
7562 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
7563 SelectionDAG &DAG) const {
7564 // Jump table entries are emitted as PC-relative offsets. No additional
7565 // tweaking is necessary here. Just get the address of the jump table.
7566 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7568 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7569 !Subtarget->isTargetMachO()) {
7570 return getAddrLarge(JT, DAG);
7571 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7572 return getAddrTiny(JT, DAG);
7574 return getAddr(JT, DAG);
7577 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
7578 SelectionDAG &DAG) const {
7579 // Jump table entries are emitted as PC-relative offsets. No additional
7580 // tweaking is necessary here. Just get the address of the jump table.
7582 SDValue JT = Op.getOperand(1);
7583 SDValue Entry = Op.getOperand(2);
7584 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
7586 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7587 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
7590 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
7591 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
7592 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
7596 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
7597 SelectionDAG &DAG) const {
7598 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7600 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
7601 // Use the GOT for the large code model on iOS.
7602 if (Subtarget->isTargetMachO()) {
7603 return getGOT(CP, DAG);
7605 return getAddrLarge(CP, DAG);
7606 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7607 return getAddrTiny(CP, DAG);
7609 return getAddr(CP, DAG);
7613 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
7614 SelectionDAG &DAG) const {
7615 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
7616 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7617 !Subtarget->isTargetMachO()) {
7618 return getAddrLarge(BA, DAG);
7619 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7620 return getAddrTiny(BA, DAG);
7622 return getAddr(BA, DAG);
7625 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
7626 SelectionDAG &DAG) const {
7627 AArch64FunctionInfo *FuncInfo =
7628 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7631 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
7632 getPointerTy(DAG.getDataLayout()));
7633 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
7634 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7635 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
7636 MachinePointerInfo(SV));
7639 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
7640 SelectionDAG &DAG) const {
7641 AArch64FunctionInfo *FuncInfo =
7642 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7645 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
7646 ? FuncInfo->getVarArgsGPRIndex()
7647 : FuncInfo->getVarArgsStackIndex(),
7648 getPointerTy(DAG.getDataLayout()));
7649 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7650 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
7651 MachinePointerInfo(SV));
7654 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
7655 SelectionDAG &DAG) const {
7656 // The layout of the va_list struct is specified in the AArch64 Procedure Call
7657 // Standard, section B.3.
7658 MachineFunction &MF = DAG.getMachineFunction();
7659 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7660 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7661 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7662 auto PtrVT = getPointerTy(DAG.getDataLayout());
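// For reference, the va_list layout being initialised below (ILP32 offsets in
// parentheses):
//   void *__stack;   // offset 0
//   void *__gr_top;  // offset 8  (4)
//   void *__vr_top;  // offset 16 (8)
//   int   __gr_offs; // offset 24 (12)
//   int   __vr_offs; // offset 28 (16)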
7665 SDValue Chain = Op.getOperand(0);
7666 SDValue VAList = Op.getOperand(1);
7667 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7668 SmallVector<SDValue, 4> MemOps;
7670 // void *__stack at offset 0
7671 unsigned Offset = 0;
7672 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
7673 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
7674 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
7675 MachinePointerInfo(SV), Align(PtrSize)));
7677 // void *__gr_top at offset 8 (4 on ILP32)
7679 int GPRSize = FuncInfo->getVarArgsGPRSize();
7681 SDValue GRTop, GRTopAddr;
7683 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7684 DAG.getConstant(Offset, DL, PtrVT));
7686 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
7687 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
7688 DAG.getConstant(GPRSize, DL, PtrVT));
7689 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
7691 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
7692 MachinePointerInfo(SV, Offset),
7696 // void *__vr_top at offset 16 (8 on ILP32)
7698 int FPRSize = FuncInfo->getVarArgsFPRSize();
7700 SDValue VRTop, VRTopAddr;
7701 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7702 DAG.getConstant(Offset, DL, PtrVT));
7704 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
7705 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
7706 DAG.getConstant(FPRSize, DL, PtrVT));
7707 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
7709 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
7710 MachinePointerInfo(SV, Offset),
7714 // int __gr_offs at offset 24 (12 on ILP32)
7716 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7717 DAG.getConstant(Offset, DL, PtrVT));
7719 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
7720 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
7722 // int __vr_offs at offset 28 (16 on ILP32)
7724 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7725 DAG.getConstant(Offset, DL, PtrVT));
7727 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
7728 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
7730 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7733 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
7734 SelectionDAG &DAG) const {
7735 MachineFunction &MF = DAG.getMachineFunction();
7737 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
7738 return LowerWin64_VASTART(Op, DAG);
7739 else if (Subtarget->isTargetDarwin())
7740 return LowerDarwin_VASTART(Op, DAG);
7742 return LowerAAPCS_VASTART(Op, DAG);
7745 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
7746 SelectionDAG &DAG) const {
7747   // AAPCS has three pointers and two ints (= 32 bytes, or 20 on ILP32); Darwin and Windows use a single pointer (= PtrSize bytes).
7750 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7751 unsigned VaListSize =
7752 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7754 : Subtarget->isTargetILP32() ? 20 : 32;
7755 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
7756 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7758 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
7759 DAG.getConstant(VaListSize, DL, MVT::i32),
7760 Align(PtrSize), false, false, false,
7761 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
7764 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
7765 assert(Subtarget->isTargetDarwin() &&
7766 "automatic va_arg instruction only works on Darwin");
7768 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7769 EVT VT = Op.getValueType();
7771 SDValue Chain = Op.getOperand(0);
7772 SDValue Addr = Op.getOperand(1);
7773 MaybeAlign Align(Op.getConstantOperandVal(3));
7774 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
7775 auto PtrVT = getPointerTy(DAG.getDataLayout());
7776 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7778 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
7779 Chain = VAList.getValue(1);
7780 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
7782 if (VT.isScalableVector())
7783 report_fatal_error("Passing SVE types to variadic functions is "
7784 "currently not supported");
7786 if (Align && *Align > MinSlotSize) {
7787 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7788 DAG.getConstant(Align->value() - 1, DL, PtrVT));
7789 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
7790 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
7793 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
7794 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
7796 // Scalar integer and FP values smaller than 64 bits are implicitly extended
7797 // up to 64 bits. At the very least, we have to increase the striding of the
7798 // vaargs list to match this, and for FP values we need to introduce
7799 // FP_ROUND nodes as well.
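  // For example, an f32 vaarg still occupies a full 64-bit slot here; the code
  // below loads it as an f64 and FP_ROUNDs the result back down to f32.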
7800 if (VT.isInteger() && !VT.isVector())
7801 ArgSize = std::max(ArgSize, MinSlotSize);
7802 bool NeedFPTrunc = false;
7803 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
7808 // Increment the pointer, VAList, to the next vaarg
7809 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7810 DAG.getConstant(ArgSize, DL, PtrVT));
7811 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
7813 // Store the incremented VAList to the legalized pointer
7815 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
7817 // Load the actual argument out of the pointer VAList
7819 // Load the value as an f64.
7821 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
7822 // Round the value down to an f32.
7823 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
7824 DAG.getIntPtrConstant(1, DL));
7825 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
7826 // Merge the rounded value with the chain output of the load.
7827 return DAG.getMergeValues(Ops, DL);
7830 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
7833 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
7834 SelectionDAG &DAG) const {
7835 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7836 MFI.setFrameAddressIsTaken(true);
7838 EVT VT = Op.getValueType();
7840 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7842 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
7844 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
7845 MachinePointerInfo());
7847 if (Subtarget->isTargetILP32())
7848 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
7849 DAG.getValueType(VT));
7854 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
7855 SelectionDAG &DAG) const {
7856 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7858 EVT VT = getPointerTy(DAG.getDataLayout());
7860 int FI = MFI.CreateFixedObject(4, 0, false);
7861 return DAG.getFrameIndex(FI, VT);
7864 #define GET_REGISTER_MATCHER
7865 #include "AArch64GenAsmMatcher.inc"
7867 // FIXME? Maybe this could be a TableGen attribute on some registers and
7868 // this table could be generated automatically from RegInfo.
7869 Register AArch64TargetLowering::
7870 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
7871 Register Reg = MatchRegisterName(RegName);
7872 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
7873 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
7874 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
7875 if (!Subtarget->isXRegisterReserved(DwarfRegNum))
7880 report_fatal_error(Twine("Invalid register name \""
7881 + StringRef(RegName) + "\"."));
7884 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
7885 SelectionDAG &DAG) const {
7886 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
7888 EVT VT = Op.getValueType();
7892 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
7893 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
7895 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
7898 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
7899 SelectionDAG &DAG) const {
7900 MachineFunction &MF = DAG.getMachineFunction();
7901 MachineFrameInfo &MFI = MF.getFrameInfo();
7902 MFI.setReturnAddressIsTaken(true);
7904 EVT VT = Op.getValueType();
7906 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7907 SDValue ReturnAddress;
7909 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7910 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
7911 ReturnAddress = DAG.getLoad(
7912 VT, DL, DAG.getEntryNode(),
7913 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
7915     // Return LR, which contains the return address. Mark it an implicit live-in.
7917 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
7918 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7921 // The XPACLRI instruction assembles to a hint-space instruction before
7922   // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
7923   // On Armv8.3-A and onwards, XPACI is available, so use that instead.
7926 if (Subtarget->hasPAuth()) {
7927 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
7929 // XPACLRI operates on LR therefore we must move the operand accordingly.
7931 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
7932 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
7934 return SDValue(St, 0);
7937 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
7938 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
7939 SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
7940 SelectionDAG &DAG) const {
7942 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
7943 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
7946 bool AArch64TargetLowering::isOffsetFoldingLegal(
7947 const GlobalAddressSDNode *GA) const {
7948 // Offsets are folded in the DAG combine rather than here so that we can
7949 // intelligently choose an offset based on the uses.
7953 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
7954 bool OptForSize) const {
7955 bool IsLegal = false;
7956   // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
7957   // and for the 16-bit case when the target has full fp16 support.
7958 // FIXME: We should be able to handle f128 as well with a clever lowering.
7959 const APInt ImmInt = Imm.bitcastToAPInt();
7961 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
7962 else if (VT == MVT::f32)
7963 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
7964 else if (VT == MVT::f16 && Subtarget->hasFullFP16())
7965 IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
7966   // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
7967 // generate that fmov.
7969   // If we cannot materialize the value in the fmov immediate field, check if the
7970 // value can be encoded as the immediate operand of a logical instruction.
7971 // The immediate value will be created with either MOVZ, MOVN, or ORR.
7972 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
7973 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
7974 // however the mov+fmov sequence is always better because of the reduced
7975 // cache pressure. The timings are still the same if you consider
7976 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
7977     // movw+movk is fused). So we limit the expansion to at most 2 instructions.
7978 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7979 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
7981 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
7982 IsLegal = Insn.size() <= Limit;
7985 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
7986 << " imm value: "; Imm.dump(););
7990 //===----------------------------------------------------------------------===//
7991 // AArch64 Optimization Hooks
7992 //===----------------------------------------------------------------------===//
7994 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
7995 SDValue Operand, SelectionDAG &DAG,
7997 EVT VT = Operand.getValueType();
7998 if (ST->hasNEON() &&
7999 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
8000 VT == MVT::f32 || VT == MVT::v1f32 ||
8001 VT == MVT::v2f32 || VT == MVT::v4f32)) {
8002 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
8003 // For the reciprocal estimates, convergence is quadratic, so the number
8004 // of digits is doubled after each iteration. In ARMv8, the accuracy of
8005 // the initial estimate is 2^-8. Thus the number of extra steps to refine
8006 // the result for float (23 mantissa bits) is 2 and for double (52
8007 // mantissa bits) is 3.
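      // Put differently: starting from roughly 8 correct bits, each step
      // doubles the precision, so 8 -> 16 -> 32 bits suffices for f32 after
      // 2 steps and 8 -> 16 -> 32 -> 64 bits for f64 after 3.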
8008 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
8010 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
8017 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
8018 const DenormalMode &Mode) const {
8020 EVT VT = Op.getValueType();
8021 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
8022 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
8023 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
8027 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
8028 SelectionDAG &DAG) const {
8032 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
8033 SelectionDAG &DAG, int Enabled,
8036 bool Reciprocal) const {
8037 if (Enabled == ReciprocalEstimate::Enabled ||
8038 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
8039 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
8042 EVT VT = Operand.getValueType();
8045 Flags.setAllowReassociation(true);
8047 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
8048 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
8049 for (int i = ExtraSteps; i > 0; --i) {
8050 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
8052 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
8053 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
8056 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
8065 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
8066 SelectionDAG &DAG, int Enabled,
8067 int &ExtraSteps) const {
8068 if (Enabled == ReciprocalEstimate::Enabled)
8069 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
8072 EVT VT = Operand.getValueType();
8075 Flags.setAllowReassociation(true);
8077 // Newton reciprocal iteration: E * (2 - X * E)
8078 // AArch64 reciprocal iteration instruction: (2 - M * N)
8079 for (int i = ExtraSteps; i > 0; --i) {
8080 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
8082 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
8092 //===----------------------------------------------------------------------===//
8093 // AArch64 Inline Assembly Support
8094 //===----------------------------------------------------------------------===//
8096 // Table of Constraints
8097 // TODO: This is the current set of constraints supported by ARM for the
8098 // compiler; not all of them may make sense.
8100 // r - A general register
8101 // w - An FP/SIMD register of some size in the range v0-v31
8102 // x - An FP/SIMD register of some size in the range v0-v15
8103 // I - Constant that can be used with an ADD instruction
8104 // J - Constant that can be used with a SUB instruction
8105 // K - Constant that can be used with a 32-bit logical instruction
8106 // L - Constant that can be used with a 64-bit logical instruction
8107 // M - Constant that can be used as a 32-bit MOV immediate
8108 // N - Constant that can be used as a 64-bit MOV immediate
8109 // Q - A memory reference with base register and no offset
8110 // S - A symbolic address
8111 // Y - Floating point constant zero
8112 // Z - Integer constant zero
8114 // Note that general register operands will be output using their 64-bit x
8115 // register name, whatever the size of the variable, unless the asm operand
8116 // is prefixed by the %w modifier. Floating-point and SIMD register operands
8117 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or %q modifier.
8119 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
8120 // At this point, we have to lower this constraint to something else, so we
8121 // lower it to an "r" or "w". However, by doing this we will force the result
8122   // to be in a register, while the X constraint is much more permissive.
8124 // Although we are correct (we are free to emit anything, without
8125 // constraints), we might break use cases that would expect us to be more
8126 // efficient and emit something else.
8127 if (!Subtarget->hasFPARMv8())
8130 if (ConstraintVT.isFloatingPoint())
8133 if (ConstraintVT.isVector() &&
8134 (ConstraintVT.getSizeInBits() == 64 ||
8135 ConstraintVT.getSizeInBits() == 128))
8141 enum PredicateConstraint {
8147 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
8148 PredicateConstraint P = PredicateConstraint::Invalid;
8149 if (Constraint == "Upa")
8150 P = PredicateConstraint::Upa;
8151 if (Constraint == "Upl")
8152 P = PredicateConstraint::Upl;
8156 /// getConstraintType - Given a constraint letter, return the type of
8157 /// constraint it is for this target.
8158 AArch64TargetLowering::ConstraintType
8159 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
8160 if (Constraint.size() == 1) {
8161 switch (Constraint[0]) {
8167 return C_RegisterClass;
8168 // An address with a single base register. Due to the way we
8169 // currently handle addresses it is the same as 'r'.
8182 case 'S': // A symbolic address
8185 } else if (parsePredicateConstraint(Constraint) !=
8186 PredicateConstraint::Invalid)
8187 return C_RegisterClass;
8188 return TargetLowering::getConstraintType(Constraint);
8191 /// Examine constraint type and operand type and determine a weight value.
8192 /// This object must already have been set up with the operand type
8193 /// and the current alternative constraint selected.
8194 TargetLowering::ConstraintWeight
8195 AArch64TargetLowering::getSingleConstraintMatchWeight(
8196 AsmOperandInfo &info, const char *constraint) const {
8197 ConstraintWeight weight = CW_Invalid;
8198 Value *CallOperandVal = info.CallOperandVal;
8199 // If we don't have a value, we can't do a match,
8200 // but allow it at the lowest weight.
8201 if (!CallOperandVal)
8203 Type *type = CallOperandVal->getType();
8204 // Look at the constraint type.
8205 switch (*constraint) {
8207 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
8212 if (type->isFloatingPointTy() || type->isVectorTy())
8213 weight = CW_Register;
8216 weight = CW_Constant;
8219 if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
8220 weight = CW_Register;
8226 std::pair<unsigned, const TargetRegisterClass *>
8227 AArch64TargetLowering::getRegForInlineAsmConstraint(
8228 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
8229 if (Constraint.size() == 1) {
8230 switch (Constraint[0]) {
8232 if (VT.isScalableVector())
8233 return std::make_pair(0U, nullptr);
8234 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
8235 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
8236 if (VT.getFixedSizeInBits() == 64)
8237 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
8238 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
8240 if (!Subtarget->hasFPARMv8())
8242 if (VT.isScalableVector()) {
8243 if (VT.getVectorElementType() != MVT::i1)
8244 return std::make_pair(0U, &AArch64::ZPRRegClass);
8245 return std::make_pair(0U, nullptr);
8247 uint64_t VTSize = VT.getFixedSizeInBits();
8249 return std::make_pair(0U, &AArch64::FPR16RegClass);
8251 return std::make_pair(0U, &AArch64::FPR32RegClass);
8253 return std::make_pair(0U, &AArch64::FPR64RegClass);
8255 return std::make_pair(0U, &AArch64::FPR128RegClass);
8258 // The instructions that this constraint is designed for can
8259 // only take 128-bit registers so just use that regclass.
8261 if (!Subtarget->hasFPARMv8())
8263 if (VT.isScalableVector())
8264 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
8265 if (VT.getSizeInBits() == 128)
8266 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
8269 if (!Subtarget->hasFPARMv8())
8271 if (VT.isScalableVector())
8272 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
8276 PredicateConstraint PC = parsePredicateConstraint(Constraint);
8277 if (PC != PredicateConstraint::Invalid) {
8278 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
8279 return std::make_pair(0U, nullptr);
8280 bool restricted = (PC == PredicateConstraint::Upl);
8281 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
8282 : std::make_pair(0U, &AArch64::PPRRegClass);
8285 if (StringRef("{cc}").equals_insensitive(Constraint))
8286 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
8288 // Use the default implementation in TargetLowering to convert the register
8289 // constraint into a member of a register class.
8290 std::pair<unsigned, const TargetRegisterClass *> Res;
8291 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
8293 // Not found as a standard register?
8295 unsigned Size = Constraint.size();
8296 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
8297 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
8299 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
8300 if (!Failed && RegNo >= 0 && RegNo <= 31) {
8301 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
8302 // By default we'll emit v0-v31 for this unless there's a modifier where
8303 // we'll emit the correct register as well.
8304 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
8305 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
8306 Res.second = &AArch64::FPR64RegClass;
8308 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
8309 Res.second = &AArch64::FPR128RegClass;
8315 if (Res.second && !Subtarget->hasFPARMv8() &&
8316 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
8317 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
8318 return std::make_pair(0U, nullptr);
8323 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
8325 bool AllowUnknown) const {
8326 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
8327 return EVT(MVT::i64x8);
8329 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
8332 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
8333 /// vector. If it is invalid, don't add anything to Ops.
8334 void AArch64TargetLowering::LowerAsmOperandForConstraint(
8335 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
8336 SelectionDAG &DAG) const {
8339 // Currently only support length 1 constraints.
8340 if (Constraint.length() != 1)
8343 char ConstraintLetter = Constraint[0];
8344 switch (ConstraintLetter) {
8348   // This set of constraints deals with valid constants for various instructions.
8349 // Validate and return a target constant for them if we can.
8351 // 'z' maps to xzr or wzr so it needs an input of 0.
8352 if (!isNullConstant(Op))
8355 if (Op.getValueType() == MVT::i64)
8356 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
8358 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
8362 // An absolute symbolic address or label reference.
8363 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
8364 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
8365 GA->getValueType(0));
8366 } else if (const BlockAddressSDNode *BA =
8367 dyn_cast<BlockAddressSDNode>(Op)) {
8369 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
8381 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
8385 // Grab the value and do some validation.
8386 uint64_t CVal = C->getZExtValue();
8387 switch (ConstraintLetter) {
8388 // The I constraint applies only to simple ADD or SUB immediate operands:
8389   // i.e. 0 to 4095 with an optional left shift by 12.
8390 // The J constraint applies only to ADD or SUB immediates that would be
8391 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
8392 // instruction [or vice versa], in other words -1 to -4095 with optional
8393 // left shift by 12.
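    // Illustratively, #4095 and #(4095 << 12) both satisfy I, while values in
    // the range -1 to -4095 (again optionally shifted left by 12) satisfy J.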
8395 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
8399 uint64_t NVal = -C->getSExtValue();
8400 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
8401 CVal = C->getSExtValue();
8406 // The K and L constraints apply *only* to logical immediates, including
8407 // what used to be the MOVI alias for ORR (though the MOVI alias has now
8408 // been removed and MOV should be used). So these constraints have to
8409 // distinguish between bit patterns that are valid 32-bit or 64-bit
8410 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
8411   // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice versa.
8414 if (AArch64_AM::isLogicalImmediate(CVal, 32))
8418 if (AArch64_AM::isLogicalImmediate(CVal, 64))
8421 // The M and N constraints are a superset of K and L respectively, for use
8422 // with the MOV (immediate) alias. As well as the logical immediates they
8423 // also match 32 or 64-bit immediates that can be loaded either using a
8424   // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
8425 // (M) or 64-bit 0x1234000000000000 (N) etc.
8426 // As a note some of this code is liberally stolen from the asm parser.
8428 if (!isUInt<32>(CVal))
8430 if (AArch64_AM::isLogicalImmediate(CVal, 32))
8432 if ((CVal & 0xFFFF) == CVal)
8434 if ((CVal & 0xFFFF0000ULL) == CVal)
8436 uint64_t NCVal = ~(uint32_t)CVal;
8437 if ((NCVal & 0xFFFFULL) == NCVal)
8439 if ((NCVal & 0xFFFF0000ULL) == NCVal)
8444 if (AArch64_AM::isLogicalImmediate(CVal, 64))
8446 if ((CVal & 0xFFFFULL) == CVal)
8448 if ((CVal & 0xFFFF0000ULL) == CVal)
8450 if ((CVal & 0xFFFF00000000ULL) == CVal)
8452 if ((CVal & 0xFFFF000000000000ULL) == CVal)
8454 uint64_t NCVal = ~CVal;
8455 if ((NCVal & 0xFFFFULL) == NCVal)
8457 if ((NCVal & 0xFFFF0000ULL) == NCVal)
8459 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
8461 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
8469 // All assembler immediates are 64-bit integers.
8470 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
8474 if (Result.getNode()) {
8475 Ops.push_back(Result);
8479 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
8482 //===----------------------------------------------------------------------===//
8483 // AArch64 Advanced SIMD Support
8484 //===----------------------------------------------------------------------===//
8486 /// WidenVector - Given a value in the V64 register class, produce the
8487 /// equivalent value in the V128 register class.
8488 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
8489 EVT VT = V64Reg.getValueType();
8490 unsigned NarrowSize = VT.getVectorNumElements();
8491 MVT EltTy = VT.getVectorElementType().getSimpleVT();
8492 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
8495 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
8496 V64Reg, DAG.getConstant(0, DL, MVT::i64));
8499 /// getExtFactor - Determine the adjustment factor for the position when
8500 /// generating an "extract from vector registers" instruction.
8501 static unsigned getExtFactor(SDValue &V) {
8502 EVT EltType = V.getValueType().getVectorElementType();
8503 return EltType.getSizeInBits() / 8;
8506 /// NarrowVector - Given a value in the V128 register class, produce the
8507 /// equivalent value in the V64 register class.
8508 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
8509 EVT VT = V128Reg.getValueType();
8510 unsigned WideSize = VT.getVectorNumElements();
8511 MVT EltTy = VT.getVectorElementType().getSimpleVT();
8512 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
8515 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
8518 // Gather data to see if the operation can be modelled as a
8519 // shuffle in combination with VEXTs.
8520 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
8521 SelectionDAG &DAG) const {
8522 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8523 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
8525 EVT VT = Op.getValueType();
8526 assert(!VT.isScalableVector() &&
8527 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
8528 unsigned NumElts = VT.getVectorNumElements();
8530 struct ShuffleSourceInfo {
8535 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8536 // be compatible with the shuffle we intend to construct. As a result
8537 // ShuffleVec will be some sliding window into the original Vec.
8540 // Code should guarantee that element i in Vec starts at element "WindowBase
8541 // + i * WindowScale in ShuffleVec".
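    // For example, if a v4i32 source is later bitcast to a v8i16 shuffle type,
    // WindowScale becomes 2 and source lane i covers shuffle lanes
    // WindowBase + 2*i and WindowBase + 2*i + 1.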
8545 ShuffleSourceInfo(SDValue Vec)
8546 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
8547 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
8549 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8552   // First gather all vectors used as an immediate source for this BUILD_VECTOR node.
8554 SmallVector<ShuffleSourceInfo, 2> Sources;
8555 for (unsigned i = 0; i < NumElts; ++i) {
8556 SDValue V = Op.getOperand(i);
8559 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8560 !isa<ConstantSDNode>(V.getOperand(1))) {
8562 dbgs() << "Reshuffle failed: "
8563 "a shuffle can only come from building a vector from "
8564 "various elements of other vectors, provided their "
8565 "indices are constant\n");
8569 // Add this element source to the list if it's not already there.
8570 SDValue SourceVec = V.getOperand(0);
8571 auto Source = find(Sources, SourceVec);
8572 if (Source == Sources.end())
8573 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8575 // Update the minimum and maximum lane number seen.
8576 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
8577 Source->MinElt = std::min(Source->MinElt, EltNo);
8578 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8581 if (Sources.size() > 2) {
8583 dbgs() << "Reshuffle failed: currently only do something sane when at "
8584 "most two source vectors are involved\n");
8588 // Find out the smallest element size among result and two sources, and use
8589 // it as element size to build the shuffle_vector.
8590 EVT SmallestEltTy = VT.getVectorElementType();
8591 for (auto &Source : Sources) {
8592 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8593 if (SrcEltTy.bitsLT(SmallestEltTy)) {
8594 SmallestEltTy = SrcEltTy;
8597 unsigned ResMultiplier =
8598 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8599 uint64_t VTSize = VT.getFixedSizeInBits();
8600 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
8601 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8603 // If the source vector is too wide or too narrow, we may nevertheless be able
8604 // to construct a compatible shuffle either by concatenating it with UNDEF or
8605 // extracting a suitable range of elements.
8606 for (auto &Src : Sources) {
8607 EVT SrcVT = Src.ShuffleVec.getValueType();
8609 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8610 if (SrcVTSize == VTSize)
8613 // This stage of the search produces a source with the same element type as
8614 // the original, but with a total width matching the BUILD_VECTOR output.
8615 EVT EltVT = SrcVT.getVectorElementType();
8616 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8617 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8619 if (SrcVTSize < VTSize) {
8620 assert(2 * SrcVTSize == VTSize);
8621       // We can pad out the smaller vector for free, so if it's part of a shuffle, widen it by concatenating it with UNDEF.
8624 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8625 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8629 if (SrcVTSize != 2 * VTSize) {
8631 dbgs() << "Reshuffle failed: result vector too small to extract\n");
8635 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8637 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
8641 if (Src.MinElt >= NumSrcElts) {
8642 // The extraction can just take the second half
8644 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8645 DAG.getConstant(NumSrcElts, dl, MVT::i64));
8646 Src.WindowBase = -NumSrcElts;
8647 } else if (Src.MaxElt < NumSrcElts) {
8648 // The extraction can just take the first half
8650 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8651 DAG.getConstant(0, dl, MVT::i64));
8653 // An actual VEXT is needed
8655 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8656 DAG.getConstant(0, dl, MVT::i64));
8658 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8659 DAG.getConstant(NumSrcElts, dl, MVT::i64));
8660 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
8662 if (!SrcVT.is64BitVector()) {
8664 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
8665 "for SVE vectors.");
8669 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
8671 DAG.getConstant(Imm, dl, MVT::i32));
8672 Src.WindowBase = -Src.MinElt;
8676 // Another possible incompatibility occurs from the vector element types. We
8677   // can fix this by bitcasting the source vectors to the same type we intend for the shuffle.
8679 for (auto &Src : Sources) {
8680 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8681 if (SrcEltTy == SmallestEltTy)
8683 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8684 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
8686 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8687 Src.WindowBase *= Src.WindowScale;
8690 // Final sanity check before we try to actually produce a shuffle.
8691 LLVM_DEBUG(for (auto Src
8693 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8695 // The stars all align, our next step is to produce the mask for the shuffle.
8696 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8697 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8698 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8699 SDValue Entry = Op.getOperand(i);
8700 if (Entry.isUndef())
8703 auto Src = find(Sources, Entry.getOperand(0));
8704 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8706 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8707     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this shuffle.
8709 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8710 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8711 VT.getScalarSizeInBits());
8712 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8714 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8715 // starting at the appropriate offset.
8716 int *LaneMask = &Mask[i * ResMultiplier];
8718 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8719 ExtractBase += NumElts * (Src - Sources.begin());
8720 for (int j = 0; j < LanesDefined; ++j)
8721 LaneMask[j] = ExtractBase + j;
8724 // Final check before we try to produce nonsense...
8725 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
8726 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
8730 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8731 for (unsigned i = 0; i < Sources.size(); ++i)
8732 ShuffleOps[i] = Sources[i].ShuffleVec;
8734 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8735 ShuffleOps[1], Mask);
8736 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
8738 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
8739 dbgs() << "Reshuffle, creating node: "; V.dump(););
8744 // check if an EXT instruction can handle the shuffle mask when the
8745 // vector sources of the shuffle are the same.
8746 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
8747 unsigned NumElts = VT.getVectorNumElements();
8749 // Assume that the first shuffle index is not UNDEF. Fail if it is.
8755 // If this is a VEXT shuffle, the immediate value is the index of the first
8756   // element. The other shuffle indices must be the successive elements after the first one.
8758 unsigned ExpectedElt = Imm;
8759 for (unsigned i = 1; i < NumElts; ++i) {
8760 // Increment the expected index. If it wraps around, just follow it
8761 // back to index zero and keep going.
8763 if (ExpectedElt == NumElts)
8767 continue; // ignore UNDEF indices
8768 if (ExpectedElt != static_cast<unsigned>(M[i]))
8775 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
8776 /// element width than the vector lane type. If that is the case, the function
8777 /// returns true and writes the value of the DUP instruction lane operand into DupLaneOp.
8779 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
8780 unsigned &DupLaneOp) {
8781 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8782 "Only possible block sizes for wide DUP are: 16, 32, 64");
8784 if (BlockSize <= VT.getScalarSizeInBits())
8786 if (BlockSize % VT.getScalarSizeInBits() != 0)
8788 if (VT.getSizeInBits() % BlockSize != 0)
8791 size_t SingleVecNumElements = VT.getVectorNumElements();
8792 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
8793 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
8795 // We are looking for masks like
8796 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
8797 // might be replaced by 'undefined'. BlockIndices will eventually contain
8798 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
8799 // for the above examples)
8800 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
8801 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
8802 for (size_t I = 0; I < NumEltsPerBlock; I++) {
8803 int Elt = M[BlockIndex * NumEltsPerBlock + I];
8806 // For now we don't support shuffles that use the second operand
8807 if ((unsigned)Elt >= SingleVecNumElements)
8809 if (BlockElts[I] < 0)
8811 else if (BlockElts[I] != Elt)
8815 // We found a candidate block (possibly with some undefs). It must be a
8816 // sequence of consecutive integers starting with a value divisible by
8817 // NumEltsPerBlock with some values possibly replaced by undef-s.
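  // e.g. with NumEltsPerBlock == 4, BlockElts == [4, -1, 6, 7] is accepted as
  // the block starting at lane 4 (DupLaneOp == 1), while [4, -1, 7, 6] is not.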
8819 // Find first non-undef element
8820 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
8821 assert(FirstRealEltIter != BlockElts.end() &&
8822 "Shuffle with all-undefs must have been caught by previous cases, "
8824 if (FirstRealEltIter == BlockElts.end()) {
8829 // Index of FirstRealElt in BlockElts
8830 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
8832 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
8834 // BlockElts[0] must have the following value if it isn't undef:
8835 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
8837 // Check the first element
8838 if (Elt0 % NumEltsPerBlock != 0)
8840 // Check that the sequence indeed consists of consecutive integers (modulo
8842 for (size_t I = 0; I < NumEltsPerBlock; I++)
8843 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
8846 DupLaneOp = Elt0 / NumEltsPerBlock;
8850 // check if an EXT instruction can handle the shuffle mask when the
8851 // vector sources of the shuffle are different.
8852 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
8854 // Look for the first non-undef element.
8855 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
8857   // Benefit from APInt to handle overflow when calculating the expected element.
8858 unsigned NumElts = VT.getVectorNumElements();
8859 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
8860 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
8861 // The following shuffle indices must be the successive elements after the
8862 // first real element.
8863 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
8864 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
8865 if (FirstWrongElt != M.end())
8868 // The index of an EXT is the first element if it is not UNDEF.
8869 // Watch out for the beginning UNDEFs. The EXT index should be the expected
8870 // value of the first element. E.g.
8871 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
8872 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
8873 // ExpectedElt is the last mask index plus 1.
8874 Imm = ExpectedElt.getZExtValue();
8876   // There are two different cases that require reversing the input vectors.
8877 // For example, for vector <4 x i32> we have the following cases,
8878 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
8879 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
8880 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
8881   // reversing the two input vectors.
8890 /// isREVMask - Check if a vector shuffle corresponds to a REV
8891 /// instruction with the specified blocksize. (The order of the elements
8892 /// within each block of the vector is reversed.)
8893 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
8894 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8895 "Only possible block sizes for REV are: 16, 32, 64");
8897 unsigned EltSz = VT.getScalarSizeInBits();
8901 unsigned NumElts = VT.getVectorNumElements();
8902 unsigned BlockElts = M[0] + 1;
8903 // If the first shuffle index is UNDEF, be optimistic.
8905 BlockElts = BlockSize / EltSz;
8907 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
8910 for (unsigned i = 0; i < NumElts; ++i) {
8912 continue; // ignore UNDEF indices
8913 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
8920 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
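  // ZIP interleaves the two halves of its inputs; e.g. for v4i32 the first
  // result corresponds to mask <0, 4, 1, 5> (WhichResult == 0) and the second
  // to <2, 6, 3, 7> (WhichResult == 1).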
8921 unsigned NumElts = VT.getVectorNumElements();
8922 if (NumElts % 2 != 0)
8924 WhichResult = (M[0] == 0 ? 0 : 1);
8925 unsigned Idx = WhichResult * NumElts / 2;
8926 for (unsigned i = 0; i != NumElts; i += 2) {
8927 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8928 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
8936 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8937 unsigned NumElts = VT.getVectorNumElements();
8938 WhichResult = (M[0] == 0 ? 0 : 1);
8939 for (unsigned i = 0; i != NumElts; ++i) {
8941 continue; // ignore UNDEF indices
8942 if ((unsigned)M[i] != 2 * i + WhichResult)
8949 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8950 unsigned NumElts = VT.getVectorNumElements();
8951 if (NumElts % 2 != 0)
8953 WhichResult = (M[0] == 0 ? 0 : 1);
8954 for (unsigned i = 0; i < NumElts; i += 2) {
8955 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
8956 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
8962 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
8963 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8964 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
8965 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8966 unsigned NumElts = VT.getVectorNumElements();
8967 if (NumElts % 2 != 0)
8969 WhichResult = (M[0] == 0 ? 0 : 1);
8970 unsigned Idx = WhichResult * NumElts / 2;
8971 for (unsigned i = 0; i != NumElts; i += 2) {
8972 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8973 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
8981 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
8982 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8983 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
8984 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8985 unsigned Half = VT.getVectorNumElements() / 2;
8986 WhichResult = (M[0] == 0 ? 0 : 1);
8987 for (unsigned j = 0; j != 2; ++j) {
8988 unsigned Idx = WhichResult;
8989 for (unsigned i = 0; i != Half; ++i) {
8990 int MIdx = M[i + j * Half];
8991 if (MIdx >= 0 && (unsigned)MIdx != Idx)
9000 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
9001 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
9002 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
9003 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9004 unsigned NumElts = VT.getVectorNumElements();
9005 if (NumElts % 2 != 0)
9007 WhichResult = (M[0] == 0 ? 0 : 1);
9008 for (unsigned i = 0; i < NumElts; i += 2) {
9009 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
9010 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
9016 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
9017 bool &DstIsLeft, int &Anomaly) {
9018 if (M.size() != static_cast<size_t>(NumInputElements))
9021 int NumLHSMatch = 0, NumRHSMatch = 0;
9022 int LastLHSMismatch = -1, LastRHSMismatch = -1;
9024 for (int i = 0; i < NumInputElements; ++i) {
9034 LastLHSMismatch = i;
9036 if (M[i] == i + NumInputElements)
9039 LastRHSMismatch = i;
9042 if (NumLHSMatch == NumInputElements - 1) {
9044 Anomaly = LastLHSMismatch;
9046 } else if (NumRHSMatch == NumInputElements - 1) {
9048 Anomaly = LastRHSMismatch;
9055 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
9056 if (VT.getSizeInBits() != 128)
9059 unsigned NumElts = VT.getVectorNumElements();
9061 for (int I = 0, E = NumElts / 2; I != E; I++) {
9066 int Offset = NumElts / 2;
9067 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
9068 if (Mask[I] != I + SplitLHS * Offset)
9075 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
9077 EVT VT = Op.getValueType();
9078 SDValue V0 = Op.getOperand(0);
9079 SDValue V1 = Op.getOperand(1);
9080 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
9082 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
9083 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
9086 bool SplitV0 = V0.getValueSizeInBits() == 128;
9088 if (!isConcatMask(Mask, VT, SplitV0))
9091 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
9093 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
9094 DAG.getConstant(0, DL, MVT::i64));
9096 if (V1.getValueSizeInBits() == 128) {
9097 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
9098 DAG.getConstant(0, DL, MVT::i64));
9100 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
9103 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
9104 /// the specified operations to build the shuffle.
9105 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
9106 SDValue RHS, SelectionDAG &DAG,
9108 unsigned OpNum = (PFEntry >> 26) & 0x0F;
9109 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
9110 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
9113 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
9122 OP_VUZPL, // VUZP, left result
9123 OP_VUZPR, // VUZP, right result
9124 OP_VZIPL, // VZIP, left result
9125 OP_VZIPR, // VZIP, right result
9126 OP_VTRNL, // VTRN, left result
9127 OP_VTRNR // VTRN, right result
9130 if (OpNum == OP_COPY) {
9131 if (LHSID == (1 * 9 + 2) * 9 + 3)
9133 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
9137 SDValue OpLHS, OpRHS;
9138 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
9139 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
9140 EVT VT = OpLHS.getValueType();
9144 llvm_unreachable("Unknown shuffle opcode!");
9146 // VREV divides the vector in half and swaps within the half.
9147 if (VT.getVectorElementType() == MVT::i32 ||
9148 VT.getVectorElementType() == MVT::f32)
9149 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
9150 // vrev <4 x i16> -> REV32
9151 if (VT.getVectorElementType() == MVT::i16 ||
9152 VT.getVectorElementType() == MVT::f16 ||
9153 VT.getVectorElementType() == MVT::bf16)
9154 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
9155 // vrev <4 x i8> -> REV16
9156 assert(VT.getVectorElementType() == MVT::i8);
9157 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
9162 EVT EltTy = VT.getVectorElementType();
9164 if (EltTy == MVT::i8)
9165 Opcode = AArch64ISD::DUPLANE8;
9166 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
9167 Opcode = AArch64ISD::DUPLANE16;
9168 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
9169 Opcode = AArch64ISD::DUPLANE32;
9170 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
9171 Opcode = AArch64ISD::DUPLANE64;
9173 llvm_unreachable("Invalid vector element type?");
9175 if (VT.getSizeInBits() == 64)
9176 OpLHS = WidenVector(OpLHS, DAG);
9177 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
9178 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
9183 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
9184 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
9185 DAG.getConstant(Imm, dl, MVT::i32));
9188 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
9191 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
9194 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
9197 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
9200 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
9203 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
9208 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
9209 SelectionDAG &DAG) {
9210 // Check to see if we can use the TBL instruction.
9211 SDValue V1 = Op.getOperand(0);
9212 SDValue V2 = Op.getOperand(1);
9215 EVT EltVT = Op.getValueType().getVectorElementType();
9216 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
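  // Expand each shuffle element into BytesPerElt consecutive byte indices for
  // TBL; e.g. for a v4i16 shuffle, a mask value of 3 becomes byte indices 6
  // and 7.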
9218 SmallVector<SDValue, 8> TBLMask;
9219 for (int Val : ShuffleMask) {
9220 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
9221 unsigned Offset = Byte + Val * BytesPerElt;
9222 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
9226 MVT IndexVT = MVT::v8i8;
9227 unsigned IndexLen = 8;
9228 if (Op.getValueSizeInBits() == 128) {
9229 IndexVT = MVT::v16i8;
9233 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
9234 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
9237 if (V2.getNode()->isUndef()) {
9239 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
9240 Shuffle = DAG.getNode(
9241 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
9242 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
9243 DAG.getBuildVector(IndexVT, DL,
9244 makeArrayRef(TBLMask.data(), IndexLen)));
9246 if (IndexLen == 8) {
9247 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
9248 Shuffle = DAG.getNode(
9249 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
9250 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
9251 DAG.getBuildVector(IndexVT, DL,
9252 makeArrayRef(TBLMask.data(), IndexLen)));
9254 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
9255     // cannot currently represent the register constraints on the input table registers.
9257 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
9258 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
9260 Shuffle = DAG.getNode(
9261 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
9262 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
9263 V2Cst, DAG.getBuildVector(IndexVT, DL,
9264 makeArrayRef(TBLMask.data(), IndexLen)));
9267 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
9270 static unsigned getDUPLANEOp(EVT EltType) {
9271 if (EltType == MVT::i8)
9272 return AArch64ISD::DUPLANE8;
9273 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
9274 return AArch64ISD::DUPLANE16;
9275 if (EltType == MVT::i32 || EltType == MVT::f32)
9276 return AArch64ISD::DUPLANE32;
9277 if (EltType == MVT::i64 || EltType == MVT::f64)
9278 return AArch64ISD::DUPLANE64;
9280 llvm_unreachable("Invalid vector element type?");
9283 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
9284 unsigned Opcode, SelectionDAG &DAG) {
9285 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
9286 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
9287 // Match: dup (bitcast (extract_subv X, C)), LaneC
9288 if (BitCast.getOpcode() != ISD::BITCAST ||
9289 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
9292 // The extract index must align in the destination type. That may not
9293 // happen if the bitcast is from narrow to wide type.
9294 SDValue Extract = BitCast.getOperand(0);
9295 unsigned ExtIdx = Extract.getConstantOperandVal(1);
9296 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
9297 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
9298 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
9299 if (ExtIdxInBits % CastedEltBitWidth != 0)
9302 // Update the lane value by offsetting with the scaled extract index.
9303 LaneC += ExtIdxInBits / CastedEltBitWidth;
9305 // Determine the casted vector type of the wide vector input.
9306 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
9308 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
9309 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
9310 unsigned SrcVecNumElts =
9311 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
9312 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
9317 if (getScaledOffsetDup(V, Lane, CastVT)) {
9318 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
9319 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
9320 // The lane is incremented by the index of the extract.
9321 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
9322 Lane += V.getConstantOperandVal(1);
9323 V = V.getOperand(0);
9324 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
9325 // The lane is decremented if we are splatting from the 2nd operand.
9326 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
9327 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
9328 Lane -= Idx * VT.getVectorNumElements() / 2;
9329 V = WidenVector(V.getOperand(Idx), DAG);
9330 } else if (VT.getSizeInBits() == 64) {
9331 // Widen the operand to 128-bit register with undef.
9332 V = WidenVector(V, DAG);
9334 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
9337 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
9338 SelectionDAG &DAG) const {
9340 EVT VT = Op.getValueType();
9342 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
9344 if (useSVEForFixedLengthVectorVT(VT))
9345 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
9347 // Convert shuffles that are directly supported on NEON to target-specific
9348 // DAG nodes, instead of keeping them as shuffles and matching them again
9349 // during code selection. This is more efficient and avoids the possibility
9350 // of inconsistencies between legalization and selection.
9351 ArrayRef<int> ShuffleMask = SVN->getMask();
9353 SDValue V1 = Op.getOperand(0);
9354 SDValue V2 = Op.getOperand(1);
9356 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
9357 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
9358 "Unexpected VECTOR_SHUFFLE mask size!");
9360 if (SVN->isSplat()) {
9361 int Lane = SVN->getSplatIndex();
9362 // If this is undef splat, generate it via "just" vdup, if possible.
9366 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
9367 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
9369 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
9370 // constant. If so, we can just reference the lane's definition directly.
9371 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
9372 !isa<ConstantSDNode>(V1.getOperand(Lane)))
9373 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
9375 // Otherwise, duplicate from the lane of the input vector.
9376 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
9377 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
9380 // Check if the mask matches a DUP for a wider element
9381 for (unsigned LaneSize : {64U, 32U, 16U}) {
9383 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
9384 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
9385 : LaneSize == 32 ? AArch64ISD::DUPLANE32
9386 : AArch64ISD::DUPLANE16;
9387 // Cast V1 to an integer vector with required lane size
9388 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
9389 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
9390 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
9391 V1 = DAG.getBitcast(NewVecTy, V1);
9392       // Construct the DUP instruction
9393 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
9394 // Cast back to the original type
9395 return DAG.getBitcast(VT, V1);
9399 if (isREVMask(ShuffleMask, VT, 64))
9400 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
9401 if (isREVMask(ShuffleMask, VT, 32))
9402 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
9403 if (isREVMask(ShuffleMask, VT, 16))
9404 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
9406 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
9407 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
9408 ShuffleVectorInst::isReverseMask(ShuffleMask)) {
9409 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
9410 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
9411 DAG.getConstant(8, dl, MVT::i32));
9414 bool ReverseEXT = false;
9416 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
9419 Imm *= getExtFactor(V1);
9420 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
9421 DAG.getConstant(Imm, dl, MVT::i32));
9422 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
9423 Imm *= getExtFactor(V1);
9424 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
9425 DAG.getConstant(Imm, dl, MVT::i32));
9428 unsigned WhichResult;
9429 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
9430 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
9431 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9433 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
9434 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
9435 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9437 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
9438 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
9439 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9442 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9443 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
9444 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9446 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9447 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
9448 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9450 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9451 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
9452 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9455 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
9460 int NumInputElements = V1.getValueType().getVectorNumElements();
9461 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
9462 SDValue DstVec = DstIsLeft ? V1 : V2;
9463 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
9465 SDValue SrcVec = V1;
9466 int SrcLane = ShuffleMask[Anomaly];
9467 if (SrcLane >= NumInputElements) {
9469 SrcLane -= VT.getVectorNumElements();
9471 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
9473 EVT ScalarVT = VT.getVectorElementType();
9475 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
9476 ScalarVT = MVT::i32;
9479 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9480 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
9484 // If the shuffle is not directly supported and it has 4 elements, use
9485 // the PerfectShuffle-generated table to synthesize it from other shuffles.
9486 unsigned NumElts = VT.getVectorNumElements();
9488 unsigned PFIndexes[4];
9489 for (unsigned i = 0; i != 4; ++i) {
9490 if (ShuffleMask[i] < 0)
9493 PFIndexes[i] = ShuffleMask[i];
9496 // Compute the index in the perfect shuffle table.
9497 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
9498 PFIndexes[2] * 9 + PFIndexes[3];
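// For example, the identity mask <0,1,2,3> (with no undef lanes) yields
// PFTableIndex = 0*729 + 1*81 + 2*9 + 3 = 102.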
9499 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
9500 unsigned Cost = (PFEntry >> 30);
9503 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
9506 return GenerateTBL(Op, ShuffleMask, DAG);
9509 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
9510 SelectionDAG &DAG) const {
9512 EVT VT = Op.getValueType();
9513 EVT ElemVT = VT.getScalarType();
9514 SDValue SplatVal = Op.getOperand(0);
9516 if (useSVEForFixedLengthVectorVT(VT))
9517 return LowerToScalableOp(Op, DAG);
9519 // Extend input splat value where needed to fit into a GPR (32b or 64b only)
9520 // FPRs don't have this restriction.
9521 switch (ElemVT.getSimpleVT().SimpleTy) {
9523 // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
9524 // lowering code.
9525 if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
9526 if (ConstVal->isOne())
9527 return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
9528 // TODO: Add special case for constant false
9530 // The general case of i1. There isn't any natural way to do this,
9531 // so we use some trickery with whilelo.
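// SplatVal is sign-extended from i1 below, so a true input becomes all-ones
// (UINT64_MAX) and a false input becomes zero; whilelo(0, SplatVal) then
// yields an all-true or an all-false predicate respectively.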
9532 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
9533 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
9534 DAG.getValueType(MVT::i1));
9535 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
9537 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
9538 DAG.getConstant(0, dl, MVT::i64), SplatVal);
9543 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
9546 SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
9555 report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
9558 return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
9561 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
9562 SelectionDAG &DAG) const {
9565 EVT VT = Op.getValueType();
9566 if (!isTypeLegal(VT) || !VT.isScalableVector())
9569 // Current lowering only supports the SVE-ACLE types.
9570 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
9573 // The DUPQ operation is independent of element type so normalise to i64s.
9574 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
9575 SDValue Idx128 = Op.getOperand(2);
9577 // DUPQ can be used when idx is in range.
9578 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
9579 if (CIdx && (CIdx->getZExtValue() <= 3)) {
9580 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
9582 DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
9583 return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
9586 // The ACLE says this must produce the same result as:
9587 // svtbl(data, svadd_x(svptrue_b64(),
9588 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
9589 // index * 2))
9590 SDValue One = DAG.getConstant(1, DL, MVT::i64);
9591 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
9593 // create the vector 0,1,0,1,...
9594 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
9595 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
9597 // create the vector idx64,idx64+1,idx64,idx64+1,...
9598 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
9599 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
9600 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
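// For example, with Idx128 == 1 this mask is <2,3,2,3,...>, so the TBL below
// broadcasts the second 128-bit block of V across the result.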
9602 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
9603 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
9604 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
9608 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
9610 EVT VT = BVN->getValueType(0);
9611 APInt SplatBits, SplatUndef;
9612 unsigned SplatBitSize;
9614 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9615 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
9617 for (unsigned i = 0; i < NumSplats; ++i) {
9618 CnstBits <<= SplatBitSize;
9619 UndefBits <<= SplatBitSize;
9620 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
9621 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
9630 // Try 64-bit splatted SIMD immediate.
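// A type-10 modified immediate is a 64-bit value in which every byte is
// either 0x00 or 0xFF, e.g. 0x00FF00FF00FF00FF, which a 64-bit MOVI can
// materialize directly.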
9631 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9632 const APInt &Bits) {
9633 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9634 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9635 EVT VT = Op.getValueType();
9636 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
9638 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
9639 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
9642 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9643 DAG.getConstant(Value, dl, MVT::i32));
9644 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9651 // Try 32-bit splatted SIMD immediate.
9652 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9654 const SDValue *LHS = nullptr) {
9655 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9656 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9657 EVT VT = Op.getValueType();
9658 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9659 bool isAdvSIMDModImm = false;
9662 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
9663 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
9666 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
9667 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
9670 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
9671 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
9674 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
9675 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
9679 if (isAdvSIMDModImm) {
9684 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9685 DAG.getConstant(Value, dl, MVT::i32),
9686 DAG.getConstant(Shift, dl, MVT::i32));
9688 Mov = DAG.getNode(NewOp, dl, MovTy,
9689 DAG.getConstant(Value, dl, MVT::i32),
9690 DAG.getConstant(Shift, dl, MVT::i32));
9692 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9699 // Try 16-bit splatted SIMD immediate.
9700 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9702 const SDValue *LHS = nullptr) {
9703 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9704 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9705 EVT VT = Op.getValueType();
9706 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
9707 bool isAdvSIMDModImm = false;
9710 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
9711 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
9714 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
9715 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
9719 if (isAdvSIMDModImm) {
9724 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9725 DAG.getConstant(Value, dl, MVT::i32),
9726 DAG.getConstant(Shift, dl, MVT::i32));
9728 Mov = DAG.getNode(NewOp, dl, MovTy,
9729 DAG.getConstant(Value, dl, MVT::i32),
9730 DAG.getConstant(Shift, dl, MVT::i32));
9732 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9739 // Try 32-bit splatted SIMD immediate with shifted ones.
9740 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
9741 SelectionDAG &DAG, const APInt &Bits) {
9742 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9743 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9744 EVT VT = Op.getValueType();
9745 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9746 bool isAdvSIMDModImm = false;
9749 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
9750 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
9753 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
9754 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
9758 if (isAdvSIMDModImm) {
9760 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9761 DAG.getConstant(Value, dl, MVT::i32),
9762 DAG.getConstant(Shift, dl, MVT::i32));
9763 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9770 // Try 8-bit splatted SIMD immediate.
9771 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9772 const APInt &Bits) {
9773 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9774 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9775 EVT VT = Op.getValueType();
9776 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
9778 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
9779 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
9782 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9783 DAG.getConstant(Value, dl, MVT::i32));
9784 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9791 // Try FP splatted SIMD immediate.
9792 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9793 const APInt &Bits) {
9794 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9795 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9796 EVT VT = Op.getValueType();
9797 bool isWide = (VT.getSizeInBits() == 128);
9799 bool isAdvSIMDModImm = false;
9801 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
9802 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
9803 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
9806 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
9807 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
9811 if (isAdvSIMDModImm) {
9813 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9814 DAG.getConstant(Value, dl, MVT::i32));
9815 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9822 // Specialized code to quickly find if PotentialBVec is a BuildVector that
9823 // consists of only the same constant int value, returned in reference arg
9824 // ConstVal.
9825 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
9826 uint64_t &ConstVal) {
9827 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
9830 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
9833 EVT VT = Bvec->getValueType(0);
9834 unsigned NumElts = VT.getVectorNumElements();
9835 for (unsigned i = 1; i < NumElts; ++i)
9836 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
9838 ConstVal = FirstElt->getZExtValue();
9842 static unsigned getIntrinsicID(const SDNode *N) {
9843 unsigned Opcode = N->getOpcode();
9846 return Intrinsic::not_intrinsic;
9847 case ISD::INTRINSIC_WO_CHAIN: {
9848 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9849 if (IID < Intrinsic::num_intrinsics)
9851 return Intrinsic::not_intrinsic;
9856 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
9857 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
9858 // BUILD_VECTOR with constant element C1, C2 is a constant, and:
9859 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
9860 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
9861 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
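// For example, with v8i8 elements and C2 == 3 the required per-lane constants
// are C1 == 0x07 for SLI (~(0xFF << 3)) and C1 == 0xE0 for SRI
// (~(0xFF >> 3)).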
9862 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
9863 EVT VT = N->getValueType(0);
9873 SDValue FirstOp = N->getOperand(0);
9874 unsigned FirstOpc = FirstOp.getOpcode();
9875 SDValue SecondOp = N->getOperand(1);
9876 unsigned SecondOpc = SecondOp.getOpcode();
9878 // Is one of the operands an AND or a BICi? The AND may have been optimised to
9879 // a BICi in order to use an immediate instead of a register.
9880 // Is the other operand a shl or lshr? This will have been turned into:
9881 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
9882 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
9883 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
9887 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
9888 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
9894 bool IsAnd = And.getOpcode() == ISD::AND;
9895 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
9897 // Is the shift amount constant?
9898 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
9904 // Is the and mask vector all constant?
9905 if (!isAllConstantBuildVector(And.getOperand(1), C1))
9908 // Reconstruct the corresponding AND immediate from the two BICi immediates.
9909 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
9910 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
9911 assert(C1nodeImm && C1nodeShift);
9912 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
9915 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
9916 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
9917 // how much one can shift elements of a particular size?
9918 uint64_t C2 = C2node->getZExtValue();
9919 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
9920 if (C2 > ElemSizeInBits)
9923 APInt C1AsAPInt(ElemSizeInBits, C1);
9924 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
9925 : APInt::getLowBitsSet(ElemSizeInBits, C2);
9926 if (C1AsAPInt != RequiredC1)
9929 SDValue X = And.getOperand(0);
9930 SDValue Y = Shift.getOperand(0);
9932 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
9933 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
9935 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
9936 LLVM_DEBUG(N->dump(&DAG));
9937 LLVM_DEBUG(dbgs() << "into: \n");
9938 LLVM_DEBUG(ResultSLI->dump(&DAG));
9944 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
9945 SelectionDAG &DAG) const {
9946 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
9947 return LowerToScalableOp(Op, DAG);
9949 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
9950 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
9953 EVT VT = Op.getValueType();
9955 SDValue LHS = Op.getOperand(0);
9956 BuildVectorSDNode *BVN =
9957 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
9959 // OR commutes, so try swapping the operands.
9960 LHS = Op.getOperand(1);
9961 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
9966 APInt DefBits(VT.getSizeInBits(), 0);
9967 APInt UndefBits(VT.getSizeInBits(), 0);
9968 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
9971 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
9973 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
9977 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
9978 UndefBits, &LHS)) ||
9979 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
9984 // We can always fall back to a non-immediate OR.
9988 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
9989 // be truncated to fit element width.
9990 static SDValue NormalizeBuildVector(SDValue Op,
9991 SelectionDAG &DAG) {
9992 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
9994 EVT VT = Op.getValueType();
9995 EVT EltTy = VT.getVectorElementType();
9997 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
10000 SmallVector<SDValue, 16> Ops;
10001 for (SDValue Lane : Op->ops()) {
10002 // For integer vectors, type legalization would have promoted the
10003 // operands already. Otherwise, if Op is a floating-point splat
10004 // (with operands cast to integers), then the only possibilities
10005 // are constants and UNDEFs.
10006 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
10007 APInt LowBits(EltTy.getSizeInBits(),
10008 CstLane->getZExtValue());
10009 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
10010 } else if (Lane.getNode()->isUndef()) {
10011 Lane = DAG.getUNDEF(MVT::i32);
10013 assert(Lane.getValueType() == MVT::i32 &&
10014 "Unexpected BUILD_VECTOR operand type");
10016 Ops.push_back(Lane);
10018 return DAG.getBuildVector(VT, dl, Ops);
10021 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
10022 EVT VT = Op.getValueType();
10024 APInt DefBits(VT.getSizeInBits(), 0);
10025 APInt UndefBits(VT.getSizeInBits(), 0);
10026 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
10027 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
10029 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
10030 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10031 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
10032 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10033 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
10034 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
10037 DefBits = ~DefBits;
10038 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
10039 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
10040 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
10043 DefBits = UndefBits;
10044 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
10045 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10046 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
10047 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10048 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
10049 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
10052 DefBits = ~UndefBits;
10053 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
10054 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
10055 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
10062 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
10063 SelectionDAG &DAG) const {
10064 EVT VT = Op.getValueType();
10066 // Try to build a simple constant vector.
10067 Op = NormalizeBuildVector(Op, DAG);
10068 if (VT.isInteger()) {
10069 // Certain vector constants, used to express things like logical NOT and
10070 // arithmetic NEG, are passed through unmodified. This allows special
10071 // patterns for these operations to match, which will lower these constants
10072 // to whatever is proven necessary.
10073 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
10074 if (BVN->isConstant())
10075 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
10076 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
10078 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
10079 if (Val.isNullValue() || Val.isAllOnesValue())
10084 if (SDValue V = ConstantBuildVector(Op, DAG))
10087 // Scan through the operands to find some interesting properties we can
10088 // exploit:
10089 // 1) If only one value is used, we can use a DUP, or
10090 // 2) if only the low element is not undef, we can just insert that, or
10091 // 3) if only one constant value is used (w/ some non-constant lanes),
10092 // we can splat the constant value into the whole vector then fill
10093 // in the non-constant lanes.
10094 // 4) FIXME: If different constant values are used, but we can intelligently
10095 // select the values we'll be overwriting for the non-constant
10096 // lanes such that we can directly materialize the vector
10097 // some other way (MOVI, e.g.), we can be sneaky.
10098 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
10100 unsigned NumElts = VT.getVectorNumElements();
10101 bool isOnlyLowElement = true;
10102 bool usesOnlyOneValue = true;
10103 bool usesOnlyOneConstantValue = true;
10104 bool isConstant = true;
10105 bool AllLanesExtractElt = true;
10106 unsigned NumConstantLanes = 0;
10107 unsigned NumDifferentLanes = 0;
10108 unsigned NumUndefLanes = 0;
10110 SDValue ConstantValue;
10111 for (unsigned i = 0; i < NumElts; ++i) {
10112 SDValue V = Op.getOperand(i);
10113 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10114 AllLanesExtractElt = false;
10120 isOnlyLowElement = false;
10121 if (!isIntOrFPConstant(V))
10122 isConstant = false;
10124 if (isIntOrFPConstant(V)) {
10125 ++NumConstantLanes;
10126 if (!ConstantValue.getNode())
10128 else if (ConstantValue != V)
10129 usesOnlyOneConstantValue = false;
10132 if (!Value.getNode())
10134 else if (V != Value) {
10135 usesOnlyOneValue = false;
10136 ++NumDifferentLanes;
10140 if (!Value.getNode()) {
10142 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
10143 return DAG.getUNDEF(VT);
10146 // Convert BUILD_VECTOR where all elements but the lowest are undef into
10147 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
10148 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
10149 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
10150 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
10151 "SCALAR_TO_VECTOR node\n");
10152 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
10155 if (AllLanesExtractElt) {
10156 SDNode *Vector = nullptr;
10159 // Check whether the extract elements match the Even pattern <0,2,4,...> or
10160 // the Odd pattern <1,3,5,...>.
10161 for (unsigned i = 0; i < NumElts; ++i) {
10162 SDValue V = Op.getOperand(i);
10163 const SDNode *N = V.getNode();
10164 if (!isa<ConstantSDNode>(N->getOperand(1)))
10166 SDValue N0 = N->getOperand(0);
10168 // All elements are extracted from the same vector.
10170 Vector = N0.getNode();
10171 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
10173 if (VT.getVectorElementType() !=
10174 N0.getValueType().getVectorElementType())
10176 } else if (Vector != N0.getNode()) {
10182 // Extracted values are either at Even indices <0,2,4,...> or at Odd
10183 // indices <1,3,5,...>.
10184 uint64_t Val = N->getConstantOperandVal(1);
10185 if (Val == 2 * i) {
10189 if (Val - 1 == 2 * i) {
10194 // Something does not match: abort.
10201 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
10202 DAG.getConstant(0, dl, MVT::i64));
10204 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
10205 DAG.getConstant(NumElts, dl, MVT::i64));
10208 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
10211 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
10216 // Use DUP for non-constant splats. For f32 constant splats, reduce to
10217 // i32 and try again.
10218 if (usesOnlyOneValue) {
10220 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10221 Value.getValueType() != VT) {
10223 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
10224 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
10227 // This is actually a DUPLANExx operation, which keeps everything vectory.
10229 SDValue Lane = Value.getOperand(1);
10230 Value = Value.getOperand(0);
10231 if (Value.getValueSizeInBits() == 64) {
10233 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
10235 Value = WidenVector(Value, DAG);
10238 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
10239 return DAG.getNode(Opcode, dl, VT, Value, Lane);
10242 if (VT.getVectorElementType().isFloatingPoint()) {
10243 SmallVector<SDValue, 8> Ops;
10244 EVT EltTy = VT.getVectorElementType();
10245 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
10246 EltTy == MVT::f64) && "Unsupported floating-point vector type");
10248 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
10249 "BITCASTS, and try again\n");
10250 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
10251 for (unsigned i = 0; i < NumElts; ++i)
10252 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
10253 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
10254 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
10255 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
10257 Val = LowerBUILD_VECTOR(Val, DAG);
10259 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
10263 // If we need to insert a small number of different non-constant elements and
10264 // the vector width is sufficiently large, prefer using DUP with the common
10265 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
10266 // skip the constant lane handling below.
10267 bool PreferDUPAndInsert =
10268 !isConstant && NumDifferentLanes >= 1 &&
10269 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
10270 NumDifferentLanes >= NumConstantLanes;
10272 // If only one constant value was used, and it was used for more than one lane,
10273 // start by splatting that value, then replace the non-constant lanes. This
10274 // is better than the default, which will perform a separate initialization
10275 // for each lane.
10276 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
10277 // Firstly, try to materialize the splat constant.
10278 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
10279 Val = ConstantBuildVector(Vec, DAG);
10281 // Otherwise, materialize the constant and splat it.
10282 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
10283 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
10286 // Now insert the non-constant lanes.
10287 for (unsigned i = 0; i < NumElts; ++i) {
10288 SDValue V = Op.getOperand(i);
10289 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
10290 if (!isIntOrFPConstant(V))
10291 // Note that type legalization likely mucked about with the VT of the
10292 // source operand, so we may have to convert it here before inserting.
10293 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
10298 // This will generate a load from the constant pool.
10301 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
10306 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
10307 if (NumElts >= 4) {
10308 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
10312 if (PreferDUPAndInsert) {
10313 // First, build a constant vector with the common element.
10314 SmallVector<SDValue, 8> Ops(NumElts, Value);
10315 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
10316 // Next, insert the elements that do not match the common value.
10317 for (unsigned I = 0; I < NumElts; ++I)
10318 if (Op.getOperand(I) != Value)
10320 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
10321 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
10326 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
10327 // know the default expansion would otherwise fall back on something even
10328 // worse. For a vector with one or two non-undef values, that's
10329 // scalar_to_vector for the elements followed by a shuffle (provided the
10330 // shuffle is valid for the target) and materialization element by element
10331 // on the stack followed by a load for everything else.
10332 if (!isConstant && !usesOnlyOneValue) {
10334 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
10335 "of INSERT_VECTOR_ELT\n");
10337 SDValue Vec = DAG.getUNDEF(VT);
10338 SDValue Op0 = Op.getOperand(0);
10341 // Use SCALAR_TO_VECTOR for lane zero to
10342 // a) Avoid a RMW dependency on the full vector register, and
10343 // b) Allow the register coalescer to fold away the copy if the
10344 // value is already in an S or D register, and we're forced to emit an
10345 // INSERT_SUBREG that we can't fold anywhere.
10347 // We also allow types like i8 and i16 which are illegal scalar but legal
10348 // vector element types. After type-legalization the inserted value is
10349 // extended (i32) and it is safe to cast them to the vector type by ignoring
10350 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
10351 if (!Op0.isUndef()) {
10352 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
10353 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
10356 LLVM_DEBUG(if (i < NumElts) dbgs()
10357 << "Creating nodes for the other vector elements:\n";);
10358 for (; i < NumElts; ++i) {
10359 SDValue V = Op.getOperand(i);
10362 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
10363 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
10369 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
10370 "better alternative\n");
10374 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
10375 SelectionDAG &DAG) const {
10376 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
10377 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
10379 assert(Op.getValueType().isScalableVector() &&
10380 isTypeLegal(Op.getValueType()) &&
10381 "Expected legal scalable vector type!");
10383 if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2)
10389 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
10390 SelectionDAG &DAG) const {
10391 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
10393 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
10394 return LowerFixedLengthInsertVectorElt(Op, DAG);
10396 // Check for non-constant or out of range lane.
10397 EVT VT = Op.getOperand(0).getValueType();
10399 if (VT.getScalarType() == MVT::i1) {
10400 EVT VectorVT = getPromotedVTForPredicate(VT);
10402 SDValue ExtendedVector =
10403 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
10404 SDValue ExtendedValue =
10405 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
10406 VectorVT.getScalarType().getSizeInBits() < 32
10408 : VectorVT.getScalarType());
10410 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
10411 ExtendedValue, Op.getOperand(2));
10412 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
10415 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
10416 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
10419 // Insertion/extraction are legal for V128 types.
10420 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
10421 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
10422 VT == MVT::v8f16 || VT == MVT::v8bf16)
10425 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
10426 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
10430 // For V64 types, we perform insertion by expanding the value
10431 // to a V128 type and performing the insertion on that.
10433 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
10434 EVT WideTy = WideVec.getValueType();
10436 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
10437 Op.getOperand(1), Op.getOperand(2));
10438 // Re-narrow the resultant vector.
10439 return NarrowVector(Node, DAG);
10443 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
10444 SelectionDAG &DAG) const {
10445 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
10446 EVT VT = Op.getOperand(0).getValueType();
10448 if (VT.getScalarType() == MVT::i1) {
10449 // We can't directly extract from an SVE predicate; extend it first.
10450 // (This isn't the only possible lowering, but it's straightforward.)
10451 EVT VectorVT = getPromotedVTForPredicate(VT);
10454 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
10455 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
10456 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
10457 Extend, Op.getOperand(1));
10458 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
10461 if (useSVEForFixedLengthVectorVT(VT))
10462 return LowerFixedLengthExtractVectorElt(Op, DAG);
10464 // Check for non-constant or out of range lane.
10465 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
10466 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
10469 // Insertion/extraction are legal for V128 types.
10470 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
10471 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
10472 VT == MVT::v8f16 || VT == MVT::v8bf16)
10475 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
10476 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
10480 // For V64 types, we perform extraction by expanding the value
10481 // to a V128 type and performing the extraction on that.
10483 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
10484 EVT WideTy = WideVec.getValueType();
10486 EVT ExtrTy = WideTy.getVectorElementType();
10487 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
10490 // For extractions, we just return the result directly.
10491 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
10495 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
10496 SelectionDAG &DAG) const {
10497 assert(Op.getValueType().isFixedLengthVector() &&
10498 "Only cases that extract a fixed length vector are supported!");
10500 EVT InVT = Op.getOperand(0).getValueType();
10501 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10502 unsigned Size = Op.getValueSizeInBits();
10504 if (InVT.isScalableVector()) {
10505 // This will be matched by custom code during ISelDAGToDAG.
10506 if (Idx == 0 && isPackedVectorType(InVT, DAG))
10512 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
10513 if (Idx == 0 && InVT.getSizeInBits() <= 128)
10516 // If this is extracting the upper 64-bits of a 128-bit vector, we match
10518 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
10519 InVT.getSizeInBits() == 128)
10525 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
10526 SelectionDAG &DAG) const {
10527 assert(Op.getValueType().isScalableVector() &&
10528 "Only expect to lower inserts into scalable vectors!");
10530 EVT InVT = Op.getOperand(1).getValueType();
10531 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
10533 if (InVT.isScalableVector()) {
10535 EVT VT = Op.getValueType();
10537 if (!isTypeLegal(VT) || !VT.isInteger())
10540 SDValue Vec0 = Op.getOperand(0);
10541 SDValue Vec1 = Op.getOperand(1);
10543 // Ensure the subvector is half the size of the main vector.
10544 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
10547 // Extend elements of smaller vector...
10548 EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
10549 SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
10552 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
10553 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0);
10554 } else if (Idx == InVT.getVectorMinNumElements()) {
10555 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
10556 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec);
10562 // This will be matched by custom code during ISelDAGToDAG.
10563 if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
10569 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
10570 EVT VT = Op.getValueType();
10572 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
10573 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
10575 assert(VT.isScalableVector() && "Expected a scalable vector.");
10577 bool Signed = Op.getOpcode() == ISD::SDIV;
10578 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
10580 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
10581 return LowerToPredicatedOp(Op, DAG, PredOpcode);
10583 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
10584 // operations, and truncate the result.
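// For example, an nxv16i8 division is widened to nxv8i16 here; the widened
// divides are themselves custom-lowered the same way, so the work is
// ultimately done as nxv4i32 divisions narrowed back with UZP1.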
10586 if (VT == MVT::nxv16i8)
10587 WidenedVT = MVT::nxv8i16;
10588 else if (VT == MVT::nxv8i16)
10589 WidenedVT = MVT::nxv4i32;
10591 llvm_unreachable("Unexpected Custom DIV operation");
10594 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
10595 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
10596 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
10597 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
10598 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
10599 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
10600 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
10601 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
10602 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
10605 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
10606 // Currently no fixed length shuffles that require SVE are legal.
10607 if (useSVEForFixedLengthVectorVT(VT))
10610 if (VT.getVectorNumElements() == 4 &&
10611 (VT.is128BitVector() || VT.is64BitVector())) {
10612 unsigned PFIndexes[4];
10613 for (unsigned i = 0; i != 4; ++i) {
10617 PFIndexes[i] = M[i];
10620 // Compute the index in the perfect shuffle table.
10621 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10622 PFIndexes[2] * 9 + PFIndexes[3];
10623 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10624 unsigned Cost = (PFEntry >> 30);
10632 unsigned DummyUnsigned;
10634 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
10635 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
10636 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
10637 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
10638 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
10639 isZIPMask(M, VT, DummyUnsigned) ||
10640 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
10641 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
10642 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
10643 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
10644 isConcatMask(M, VT, VT.getSizeInBits() == 128));
10647 /// getVShiftImm - Check if this is a valid build_vector for the immediate
10648 /// operand of a vector shift operation, where all the elements of the
10649 /// build_vector must have the same constant integer value.
10650 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10651 // Ignore bit_converts.
10652 while (Op.getOpcode() == ISD::BITCAST)
10653 Op = Op.getOperand(0);
10654 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
10655 APInt SplatBits, SplatUndef;
10656 unsigned SplatBitSize;
10658 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
10659 HasAnyUndefs, ElementBits) ||
10660 SplatBitSize > ElementBits)
10662 Cnt = SplatBits.getSExtValue();
10666 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
10667 /// operand of a vector shift left operation. That value must be in the range:
10668 /// 0 <= Value < ElementBits for a left shift; or
10669 /// 0 <= Value <= ElementBits for a long left shift.
10670 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
10671 assert(VT.isVector() && "vector shift count is not a vector type");
10672 int64_t ElementBits = VT.getScalarSizeInBits();
10673 if (!getVShiftImm(Op, ElementBits, Cnt))
10675 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
10678 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
10679 /// operand of a vector shift right operation. The value must be in the range:
10680 /// 1 <= Value <= ElementBits for a right shift; or
10681 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
10682 assert(VT.isVector() && "vector shift count is not a vector type");
10683 int64_t ElementBits = VT.getScalarSizeInBits();
10684 if (!getVShiftImm(Op, ElementBits, Cnt))
10686 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
10689 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
10690 SelectionDAG &DAG) const {
10691 EVT VT = Op.getValueType();
10693 if (VT.getScalarType() == MVT::i1) {
10694 // Lower i1 truncate to `(x & 1) != 0`.
10696 EVT OpVT = Op.getOperand(0).getValueType();
10697 SDValue Zero = DAG.getConstant(0, dl, OpVT);
10698 SDValue One = DAG.getConstant(1, dl, OpVT);
10699 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
10700 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
10703 if (!VT.isVector() || VT.isScalableVector())
10706 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10707 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
10712 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
10713 SelectionDAG &DAG) const {
10714 EVT VT = Op.getValueType();
10718 if (!Op.getOperand(1).getValueType().isVector())
10720 unsigned EltSize = VT.getScalarSizeInBits();
10722 switch (Op.getOpcode()) {
10724 llvm_unreachable("unexpected shift opcode");
10727 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
10728 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
10730 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
10731 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
10732 DAG.getConstant(Cnt, DL, MVT::i32));
10733 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10734 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
10736 Op.getOperand(0), Op.getOperand(1));
10739 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
10740 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
10741 : AArch64ISD::SRL_PRED;
10742 return LowerToPredicatedOp(Op, DAG, Opc);
10745 // Right shift immediate
10746 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
10748 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
10749 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
10750 DAG.getConstant(Cnt, DL, MVT::i32));
10753 // Right shift register. Note, there is not a shift right register
10754 // instruction, but the shift left register instruction takes a signed
10755 // value, where negative numbers specify a right shift.
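// For example, (srl X, Y) is emitted here as ushl(X, 0 - Y).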
10756 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
10757 : Intrinsic::aarch64_neon_ushl;
10758 // negate the shift amount
10759 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
10761 SDValue NegShiftLeft =
10762 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10763 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
10765 return NegShiftLeft;
10771 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
10772 AArch64CC::CondCode CC, bool NoNans, EVT VT,
10773 const SDLoc &dl, SelectionDAG &DAG) {
10774 EVT SrcVT = LHS.getValueType();
10775 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
10776 "function only supposed to emit natural comparisons");
10778 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
10779 APInt CnstBits(VT.getSizeInBits(), 0);
10780 APInt UndefBits(VT.getSizeInBits(), 0);
10781 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
10782 bool IsZero = IsCnst && (CnstBits == 0);
10784 if (SrcVT.getVectorElementType().isFloatingPoint()) {
10788 case AArch64CC::NE: {
10791 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10793 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10794 return DAG.getNOT(dl, Fcmeq, VT);
10796 case AArch64CC::EQ:
10798 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10799 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10800 case AArch64CC::GE:
10802 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
10803 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
10804 case AArch64CC::GT:
10806 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
10807 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
10808 case AArch64CC::LS:
10810 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
10811 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
10812 case AArch64CC::LT:
10815 // If we ignore NaNs then we can use the MI implementation.
10817 case AArch64CC::MI:
10819 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
10820 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
10827 case AArch64CC::NE: {
10830 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10832 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10833 return DAG.getNOT(dl, Cmeq, VT);
10835 case AArch64CC::EQ:
10837 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10838 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10839 case AArch64CC::GE:
10841 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
10842 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
10843 case AArch64CC::GT:
10845 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
10846 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
10847 case AArch64CC::LE:
10849 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
10850 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
10851 case AArch64CC::LS:
10852 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
10853 case AArch64CC::LO:
10854 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
10855 case AArch64CC::LT:
10857 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
10858 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
10859 case AArch64CC::HI:
10860 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
10861 case AArch64CC::HS:
10862 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
10866 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
10867 SelectionDAG &DAG) const {
10868 if (Op.getValueType().isScalableVector())
10869 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
10871 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10872 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
10874 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
10875 SDValue LHS = Op.getOperand(0);
10876 SDValue RHS = Op.getOperand(1);
10877 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
10880 if (LHS.getValueType().getVectorElementType().isInteger()) {
10881 assert(LHS.getValueType() == RHS.getValueType());
10882 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10884 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
10885 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10888 const bool FullFP16 =
10889 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
10891 // Make v4f16 (only) fcmp operations utilise vector instructions
10892 // v8f16 support will be a little more complicated
10893 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
10894 if (LHS.getValueType().getVectorNumElements() == 4) {
10895 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
10896 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
10897 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
10898 DAG.ReplaceAllUsesWith(Op, NewSetcc);
10899 CmpVT = MVT::v4i32;
10904 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
10905 LHS.getValueType().getVectorElementType() != MVT::f128);
10907 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10908 // clean. Some of them require two branches to implement.
10909 AArch64CC::CondCode CC1, CC2;
10911 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
10913 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
10915 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
10916 if (!Cmp.getNode())
10919 if (CC2 != AArch64CC::AL) {
10921 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
10922 if (!Cmp2.getNode())
10925 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
10928 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10931 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
10936 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
10937 SelectionDAG &DAG) {
10938 SDValue VecOp = ScalarOp.getOperand(0);
10939 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
10940 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
10941 DAG.getConstant(0, DL, MVT::i64));
10944 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
10945 SelectionDAG &DAG) const {
10946 SDValue Src = Op.getOperand(0);
10948 // Try to lower fixed length reductions to SVE.
10949 EVT SrcVT = Src.getValueType();
10950 bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
10951 Op.getOpcode() == ISD::VECREDUCE_OR ||
10952 Op.getOpcode() == ISD::VECREDUCE_XOR ||
10953 Op.getOpcode() == ISD::VECREDUCE_FADD ||
10954 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
10955 SrcVT.getVectorElementType() == MVT::i64);
10956 if (SrcVT.isScalableVector() ||
10957 useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
10959 if (SrcVT.getVectorElementType() == MVT::i1)
10960 return LowerPredReductionToSVE(Op, DAG);
10962 switch (Op.getOpcode()) {
10963 case ISD::VECREDUCE_ADD:
10964 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
10965 case ISD::VECREDUCE_AND:
10966 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
10967 case ISD::VECREDUCE_OR:
10968 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
10969 case ISD::VECREDUCE_SMAX:
10970 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
10971 case ISD::VECREDUCE_SMIN:
10972 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
10973 case ISD::VECREDUCE_UMAX:
10974 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
10975 case ISD::VECREDUCE_UMIN:
10976 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
10977 case ISD::VECREDUCE_XOR:
10978 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
10979 case ISD::VECREDUCE_FADD:
10980 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
10981 case ISD::VECREDUCE_FMAX:
10982 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
10983 case ISD::VECREDUCE_FMIN:
10984 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
10986 llvm_unreachable("Unhandled fixed length reduction");
10990 // Lower NEON reductions.
10992 switch (Op.getOpcode()) {
10993 case ISD::VECREDUCE_ADD:
10994 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
10995 case ISD::VECREDUCE_SMAX:
10996 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
10997 case ISD::VECREDUCE_SMIN:
10998 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
10999 case ISD::VECREDUCE_UMAX:
11000 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
11001 case ISD::VECREDUCE_UMIN:
11002 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
11003 case ISD::VECREDUCE_FMAX: {
11004 return DAG.getNode(
11005 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
11006 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
11009 case ISD::VECREDUCE_FMIN: {
11010 return DAG.getNode(
11011 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
11012 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
11016 llvm_unreachable("Unhandled reduction");
11020 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
11021 SelectionDAG &DAG) const {
11022 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
11023 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
11026 // LSE has an atomic load-add instruction, but not a load-sub.
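// For example, an atomicrmw sub of V becomes an atomic load-add of (0 - V).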
11028 MVT VT = Op.getSimpleValueType();
11029 SDValue RHS = Op.getOperand(2);
11030 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
11031 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
11032 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
11033 Op.getOperand(0), Op.getOperand(1), RHS,
11034 AN->getMemOperand());
11037 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
11038 SelectionDAG &DAG) const {
11039 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
11040 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
11043 // LSE has an atomic load-clear instruction, but not a load-and.
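// For example, an atomicrmw and with V becomes an atomic load-clear of ~V.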
11045 MVT VT = Op.getSimpleValueType();
11046 SDValue RHS = Op.getOperand(2);
11047 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
11048 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
11049 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
11050 Op.getOperand(0), Op.getOperand(1), RHS,
11051 AN->getMemOperand());
11054 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
11055 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
11057 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11058 SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
11060 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
11061 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
11062 if (Subtarget->hasCustomCallingConv())
11063 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
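// __chkstk expects the requested allocation size in X15, expressed in units
// of 16 bytes, hence the shift right by 4 here and the shift left by 4 after
// the call.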
11065 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
11066 DAG.getConstant(4, dl, MVT::i64));
11067 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
11069 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
11070 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
11071 DAG.getRegisterMask(Mask), Chain.getValue(1));
11072 // To match the actual intent better, we should read the output from X15 here
11073 // again (instead of potentially spilling it to the stack), but rereading Size
11074 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
11077 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
11078 DAG.getConstant(4, dl, MVT::i64));
11083 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
11084 SelectionDAG &DAG) const {
11085 assert(Subtarget->isTargetWindows() &&
11086 "Only Windows alloca probing supported");
11089 SDNode *Node = Op.getNode();
11090 SDValue Chain = Op.getOperand(0);
11091 SDValue Size = Op.getOperand(1);
11093 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
11094 EVT VT = Node->getValueType(0);
11096 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11097 "no-stack-arg-probe")) {
11098 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
11099 Chain = SP.getValue(1);
11100 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
11102 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
11103 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
11104 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
11105 SDValue Ops[2] = {SP, Chain};
11106 return DAG.getMergeValues(Ops, dl);
11109 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
11111 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
11113 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
11114 Chain = SP.getValue(1);
11115 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
11117 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
11118 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
11119 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
11121 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
11122 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
11124 SDValue Ops[2] = {SP, Chain};
11125 return DAG.getMergeValues(Ops, dl);
11128 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
11129 SelectionDAG &DAG) const {
11130 EVT VT = Op.getValueType();
11131 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
11134 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
11135 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
11139 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
11140 template <unsigned NumVecs>
11142 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
11143 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
11144 Info.opc = ISD::INTRINSIC_VOID;
11145 // Retrieve EC from first vector argument.
11146 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
11147 ElementCount EC = VT.getVectorElementCount();
11149 // Check the assumption that all input vectors are the same type.
11150 for (unsigned I = 0; I < NumVecs; ++I)
11151 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
11154 // memVT is `NumVecs * VT`.
11155   Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
11156                                 EC * NumVecs);
11157 Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
11159 Info.align.reset();
11160   Info.flags = MachineMemOperand::MOStore;
11161   return true;
11162 }
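// For example, an @llvm.aarch64.sve.st2 of two <vscale x 4 x i32> operands is
// summarised here (roughly) as a single MOStore of <vscale x 8 x i32> whose
// pointer is the intrinsic's final argument.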
11164 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
11165 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
11166 /// specified in the intrinsic calls.
11167 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
11169 MachineFunction &MF,
11170 unsigned Intrinsic) const {
11171 auto &DL = I.getModule()->getDataLayout();
11172 switch (Intrinsic) {
11173 case Intrinsic::aarch64_sve_st2:
11174 return setInfoSVEStN<2>(*this, DL, Info, I);
11175 case Intrinsic::aarch64_sve_st3:
11176 return setInfoSVEStN<3>(*this, DL, Info, I);
11177 case Intrinsic::aarch64_sve_st4:
11178 return setInfoSVEStN<4>(*this, DL, Info, I);
11179 case Intrinsic::aarch64_neon_ld2:
11180 case Intrinsic::aarch64_neon_ld3:
11181 case Intrinsic::aarch64_neon_ld4:
11182 case Intrinsic::aarch64_neon_ld1x2:
11183 case Intrinsic::aarch64_neon_ld1x3:
11184 case Intrinsic::aarch64_neon_ld1x4:
11185 case Intrinsic::aarch64_neon_ld2lane:
11186 case Intrinsic::aarch64_neon_ld3lane:
11187 case Intrinsic::aarch64_neon_ld4lane:
11188 case Intrinsic::aarch64_neon_ld2r:
11189 case Intrinsic::aarch64_neon_ld3r:
11190 case Intrinsic::aarch64_neon_ld4r: {
11191 Info.opc = ISD::INTRINSIC_W_CHAIN;
11192 // Conservatively set memVT to the entire set of vectors loaded.
11193 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
11194 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
11195 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
11197 Info.align.reset();
11198 // volatile loads with NEON intrinsics not supported
11199     Info.flags = MachineMemOperand::MOLoad;
11200     return true;
11201   }
11202 case Intrinsic::aarch64_neon_st2:
11203 case Intrinsic::aarch64_neon_st3:
11204 case Intrinsic::aarch64_neon_st4:
11205 case Intrinsic::aarch64_neon_st1x2:
11206 case Intrinsic::aarch64_neon_st1x3:
11207 case Intrinsic::aarch64_neon_st1x4:
11208 case Intrinsic::aarch64_neon_st2lane:
11209 case Intrinsic::aarch64_neon_st3lane:
11210 case Intrinsic::aarch64_neon_st4lane: {
11211 Info.opc = ISD::INTRINSIC_VOID;
11212 // Conservatively set memVT to the entire set of vectors stored.
11213 unsigned NumElts = 0;
11214 for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
11215 Type *ArgTy = I.getArgOperand(ArgI)->getType();
11216 if (!ArgTy->isVectorTy())
11218 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
11220 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
11221 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
11223 Info.align.reset();
11224 // volatile stores with NEON intrinsics not supported
11225     Info.flags = MachineMemOperand::MOStore;
11226     return true;
11227   }
11228 case Intrinsic::aarch64_ldaxr:
11229 case Intrinsic::aarch64_ldxr: {
11230 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
11231 Info.opc = ISD::INTRINSIC_W_CHAIN;
11232 Info.memVT = MVT::getVT(PtrTy->getElementType());
11233 Info.ptrVal = I.getArgOperand(0);
11235 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11236     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
11237     return true;
11238   }
11239 case Intrinsic::aarch64_stlxr:
11240 case Intrinsic::aarch64_stxr: {
11241 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
11242 Info.opc = ISD::INTRINSIC_W_CHAIN;
11243 Info.memVT = MVT::getVT(PtrTy->getElementType());
11244 Info.ptrVal = I.getArgOperand(1);
11246 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11247     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
11248     return true;
11249   }
11250 case Intrinsic::aarch64_ldaxp:
11251 case Intrinsic::aarch64_ldxp:
11252 Info.opc = ISD::INTRINSIC_W_CHAIN;
11253 Info.memVT = MVT::i128;
11254 Info.ptrVal = I.getArgOperand(0);
11256 Info.align = Align(16);
11257     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
11258     return true;
11259 case Intrinsic::aarch64_stlxp:
11260 case Intrinsic::aarch64_stxp:
11261 Info.opc = ISD::INTRINSIC_W_CHAIN;
11262 Info.memVT = MVT::i128;
11263 Info.ptrVal = I.getArgOperand(2);
11265 Info.align = Align(16);
11266     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
11267     return true;
11268 case Intrinsic::aarch64_sve_ldnt1: {
11269 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
11270 Info.opc = ISD::INTRINSIC_W_CHAIN;
11271 Info.memVT = MVT::getVT(I.getType());
11272 Info.ptrVal = I.getArgOperand(1);
11274 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11275 Info.flags = MachineMemOperand::MOLoad;
11276 if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
11277       Info.flags |= MachineMemOperand::MONonTemporal;
11278     return true;
11279   }
11280 case Intrinsic::aarch64_sve_stnt1: {
11281 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
11282 Info.opc = ISD::INTRINSIC_W_CHAIN;
11283 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
11284 Info.ptrVal = I.getArgOperand(2);
11286 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11287 Info.flags = MachineMemOperand::MOStore;
11288 if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
11289       Info.flags |= MachineMemOperand::MONonTemporal;
11290     return true;
11291   }
11292   default:
11293     break;
11294   }
11296   return false;
11297 }
11299 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
11300                                                   ISD::LoadExtType ExtTy,
11301                                                   EVT NewVT) const {
11302 // TODO: This may be worth removing. Check regression tests for diffs.
11303   if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
11304     return false;
11306 // If we're reducing the load width in order to avoid having to use an extra
11307 // instruction to do extension then it's probably a good idea.
11308   if (ExtTy != ISD::NON_EXTLOAD)
11309     return true;
11310   // Don't reduce load width if it would prevent us from combining a shift into
11311   // the addressing mode.
11312   MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
11313   assert(Mem);
11314   const SDValue &Base = Mem->getBasePtr();
11315 if (Base.getOpcode() == ISD::ADD &&
11316 Base.getOperand(1).getOpcode() == ISD::SHL &&
11317 Base.getOperand(1).hasOneUse() &&
11318 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
11319 // The shift can be combined if it matches the size of the value being
11320 // loaded (and so reducing the width would make it not match).
11321 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
11322 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
11323     if (ShiftAmount == Log2_32(LoadBytes))
11324       return false;
11325   }
11326   // We have no reason to disallow reducing the load width, so allow it.
11327   return true;
11328 }
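// For example, an i64 load whose address is (add x, (shl y, 3)) keeps its full
// width: the shl already matches the [Xn, Xm, lsl #3] addressing mode, and
// narrowing the load would break that fold.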
11330 // Truncations from 64-bit GPR to 32-bit GPR are free.
11331 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
11332 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11334 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
11335 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
11336 return NumBits1 > NumBits2;
11338 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
11339 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
11341 uint64_t NumBits1 = VT1.getFixedSizeInBits();
11342 uint64_t NumBits2 = VT2.getFixedSizeInBits();
11343   return NumBits1 > NumBits2;
11344 }
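// For example, truncating an i64 to i32 just reads the W sub-register of the
// X register that already holds the value, so no instruction is needed.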
11346 /// Check if it is profitable to hoist instruction in then/else to if.
11347 /// Not profitable if I and its user can form an FMA instruction
11348 /// because we prefer FMSUB/FMADD.
11349 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
11350   if (I->getOpcode() != Instruction::FMul)
11351     return true;
11353   if (!I->hasOneUse())
11354     return true;
11356   Instruction *User = I->user_back();
11358   if (User &&
11359       !(User->getOpcode() == Instruction::FSub ||
11360         User->getOpcode() == Instruction::FAdd))
11361     return true;
11363 const TargetOptions &Options = getTargetMachine().Options;
11364 const Function *F = I->getFunction();
11365 const DataLayout &DL = F->getParent()->getDataLayout();
11366 Type *Ty = User->getOperand(0)->getType();
11368 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
11369 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
11370 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
11371            Options.UnsafeFPMath));
11372 }
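// For example, when the fmul's single user is an fadd in the same block,
// leaving the pair together lets instruction selection emit one fmadd instead
// of separate fmul and fadd instructions.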
11374 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
11375 // 64-bit GPR.
11376 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
11377 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11379 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
11380 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
11381 return NumBits1 == 32 && NumBits2 == 64;
11383 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
11384 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
11386 unsigned NumBits1 = VT1.getSizeInBits();
11387 unsigned NumBits2 = VT2.getSizeInBits();
11388 return NumBits1 == 32 && NumBits2 == 64;
11391 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
11392 EVT VT1 = Val.getValueType();
11393 if (isZExtFree(VT1, VT2)) {
11397 if (Val.getOpcode() != ISD::LOAD)
11400 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
11401 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
11402 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
11403           VT1.getSizeInBits() <= 32);
11404 }
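// For example, (zext i16 (load ...)) to i64 is free because ldrh already
// writes zeros into the upper bits of the destination register.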
11406 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
11407 if (isa<FPExtInst>(Ext))
11410 // Vector types are not free.
11411 if (Ext->getType()->isVectorTy())
11414 for (const Use &U : Ext->uses()) {
11415 // The extension is free if we can fold it with a left shift in an
11416 // addressing mode or an arithmetic operation: add, sub, and cmp.
11418 // Is there a shift?
11419 const Instruction *Instr = cast<Instruction>(U.getUser());
11421 // Is this a constant shift?
11422 switch (Instr->getOpcode()) {
11423 case Instruction::Shl:
11424 if (!isa<ConstantInt>(Instr->getOperand(1)))
11427 case Instruction::GetElementPtr: {
11428 gep_type_iterator GTI = gep_type_begin(Instr);
11429 auto &DL = Ext->getModule()->getDataLayout();
11430 std::advance(GTI, U.getOperandNo()-1);
11431 Type *IdxTy = GTI.getIndexedType();
11432 // This extension will end up with a shift because of the scaling factor.
11433 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
11434 // Get the shift amount based on the scaling factor:
11435 // log2(sizeof(IdxTy)) - log2(8).
11436 uint64_t ShiftAmt =
11437 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
11438 // Is the constant foldable in the shift of the addressing mode?
11439 // I.e., shift amount is between 1 and 4 inclusive.
11440 if (ShiftAmt == 0 || ShiftAmt > 4)
11444 case Instruction::Trunc:
11445 // Check if this is a noop.
11446 // trunc(sext ty1 to ty2) to ty1.
11447 if (Instr->getType() == Ext->getOperand(0)->getType())
11454 // At this point we can use the bfm family, so this extension is free
11460 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
11461 /// or upper half of the vector elements.
11462 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
11463 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
11464 auto *FullTy = FullV->getType();
11465 auto *HalfTy = HalfV->getType();
11466 return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
11467 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
11470 auto extractHalf = [](Value *FullV, Value *HalfV) {
11471 auto *FullVT = cast<FixedVectorType>(FullV->getType());
11472 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
11473 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
11476 ArrayRef<int> M1, M2;
11477 Value *S1Op1, *S2Op1;
11478 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
11479 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
11482 // Check that the operands are half as wide as the result and we extract
11483 // half of the elements of the input vectors.
11484 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
11485 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
11488   // Check the mask extracts either the lower or upper half of vector
11489   // elements.
11490   int M1Start = -1;
11491   int M2Start = -1;
11492 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
11493 if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
11494 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
11495       M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
11496     return false;
11498   return true;
11499 }
11501 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
11502 /// of the vector elements.
11503 static bool areExtractExts(Value *Ext1, Value *Ext2) {
11504 auto areExtDoubled = [](Instruction *Ext) {
11505 return Ext->getType()->getScalarSizeInBits() ==
11506 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
11509 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
11510 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
11511 !areExtDoubled(cast<Instruction>(Ext1)) ||
11512       !areExtDoubled(cast<Instruction>(Ext2)))
11513     return false;
11515   return true;
11516 }
11518 /// Check if Op could be used with vmull_high_p64 intrinsic.
11519 static bool isOperandOfVmullHighP64(Value *Op) {
11520 Value *VectorOperand = nullptr;
11521 ConstantInt *ElementIndex = nullptr;
11522 return match(Op, m_ExtractElt(m_Value(VectorOperand),
11523 m_ConstantInt(ElementIndex))) &&
11524 ElementIndex->getValue() == 1 &&
11525 isa<FixedVectorType>(VectorOperand->getType()) &&
11526          cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
11527 }
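// In other words, Op should typically look like
// (extractelement <2 x i64> %v, i64 1), so the extract can be folded into a
// PMULL2, which reads the high lanes of its operands directly.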
11529 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
11530 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
11531 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
11534 /// Check if sinking \p I's operands to I's basic block is profitable, because
11535 /// the operands can be folded into a target instruction, e.g.
11536 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
11537 bool AArch64TargetLowering::shouldSinkOperands(
11538 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
11539 if (!I->getType()->isVectorTy())
11542 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
11543 switch (II->getIntrinsicID()) {
11544 case Intrinsic::aarch64_neon_umull:
11545       if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
11546         return false;
11547       Ops.push_back(&II->getOperandUse(0));
11548       Ops.push_back(&II->getOperandUse(1));
11549       return true;
11551 case Intrinsic::aarch64_neon_pmull64:
11552 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
11553                                      II->getArgOperand(1)))
11554         return false;
11555       Ops.push_back(&II->getArgOperandUse(0));
11556       Ops.push_back(&II->getArgOperandUse(1));
11557       return true;
11558     default:
11559       return false;
11560     }
11561   }
11564 switch (I->getOpcode()) {
11565 case Instruction::Sub:
11566 case Instruction::Add: {
11567     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
11568       return false;
11570 // If the exts' operands extract either the lower or upper elements, we
11571 // can sink them too.
11572 auto Ext1 = cast<Instruction>(I->getOperand(0));
11573 auto Ext2 = cast<Instruction>(I->getOperand(1));
11574 if (areExtractShuffleVectors(Ext1, Ext2)) {
11575 Ops.push_back(&Ext1->getOperandUse(0));
11576       Ops.push_back(&Ext2->getOperandUse(0));
11577       return true;
11578     }
11579 Ops.push_back(&I->getOperandUse(0));
11580     Ops.push_back(&I->getOperandUse(1));
11581     return true;
11582   }
11584 case Instruction::Mul: {
11585 bool IsProfitable = false;
11586     for (auto &Op : I->operands()) {
11587       // Make sure we are not already sinking this operand
11588       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
11589         continue;
11591       ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
11592       if (!Shuffle || !Shuffle->isZeroEltSplat())
11593         continue;
11595       Value *ShuffleOperand = Shuffle->getOperand(0);
11596       InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
11597       if (!Insert)
11598         continue;
11600       Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
11601       if (!OperandInstr)
11602         continue;
11604       ConstantInt *ElementConstant =
11605           dyn_cast<ConstantInt>(Insert->getOperand(2));
11606       // Check that the insertelement is inserting into element 0
11607       if (!ElementConstant || ElementConstant->getZExtValue() != 0)
11608         continue;
11610       unsigned Opcode = OperandInstr->getOpcode();
11611       if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
11612         continue;
11614       Ops.push_back(&Shuffle->getOperandUse(0));
11615       Ops.push_back(&Op);
11616       IsProfitable = true;
11617     }
11619     return IsProfitable;
11620   }
11621   default:
11622     return false;
11623   }
11624 }
11627 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
11628 Align &RequiredAligment) const {
11629 if (!LoadedType.isSimple() ||
11630 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
11632 // Cyclone supports unaligned accesses.
11633 RequiredAligment = Align(1);
11634 unsigned NumBits = LoadedType.getSizeInBits();
11635 return NumBits == 32 || NumBits == 64;
11638 /// A helper function for determining the number of interleaved accesses we
11639 /// will generate when lowering accesses of the given type.
11641 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
11642 const DataLayout &DL) const {
11643   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
11644 }
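// For example, a <16 x i32> access (512 bits) is counted as
// (512 + 127) / 128 = 4 interleaved accesses.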
11646 MachineMemOperand::Flags
11647 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
11648 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
11649 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
11650 return MOStridedAccess;
11651 return MachineMemOperand::MONone;
11654 bool AArch64TargetLowering::isLegalInterleavedAccessType(
11655 VectorType *VecTy, const DataLayout &DL) const {
11657 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
11658 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
11660 // Ensure the number of vector elements is greater than 1.
11661 if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
11664 // Ensure the element type is legal.
11665 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
11668 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
11669 // 128 will be split into multiple interleaved accesses.
11670   return VecSize == 64 || VecSize % 128 == 0;
11671 }
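// For example, <2 x i32> (64 bits) and <8 x i32> (256 bits) are accepted,
// while <3 x i32> (96 bits) is rejected.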
11673 /// Lower an interleaved load into a ldN intrinsic.
11675 /// E.g. Lower an interleaved load (Factor = 2):
11676 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
11677 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
11678 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
11679 ///
11680 ///      Into:
11681 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
11682 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
11683 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
11684 bool AArch64TargetLowering::lowerInterleavedLoad(
11685 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
11686 ArrayRef<unsigned> Indices, unsigned Factor) const {
11687 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11688 "Invalid interleave factor");
11689 assert(!Shuffles.empty() && "Empty shufflevector input");
11690 assert(Shuffles.size() == Indices.size() &&
11691 "Unmatched number of shufflevectors and indices");
11693 const DataLayout &DL = LI->getModule()->getDataLayout();
11695 VectorType *VTy = Shuffles[0]->getType();
11697 // Skip if we do not have NEON and skip illegal vector types. We can
11698 // "legalize" wide vector types into multiple interleaved accesses as long as
11699 // the vector types are divisible by 128.
11700 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
11703 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
11705 auto *FVTy = cast<FixedVectorType>(VTy);
11707 // A pointer vector can not be the return type of the ldN intrinsics. Need to
11708 // load integer vectors first and then convert to pointer vectors.
11709 Type *EltTy = FVTy->getElementType();
11710   if (EltTy->isPointerTy())
11711     FVTy =
11712         FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
11714 IRBuilder<> Builder(LI);
11716 // The base address of the load.
11717 Value *BaseAddr = LI->getPointerOperand();
11719 if (NumLoads > 1) {
11720 // If we're going to generate more than one load, reset the sub-vector type
11721 // to something legal.
11722 FVTy = FixedVectorType::get(FVTy->getElementType(),
11723 FVTy->getNumElements() / NumLoads);
11725 // We will compute the pointer operand of each load from the original base
11726     // address using GEPs. Cast the base address to a pointer to the scalar
11727     // element type.
11728     BaseAddr = Builder.CreateBitCast(
11729         BaseAddr,
11730         FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
11731   }
11733 Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
11734 Type *Tys[2] = {FVTy, PtrTy};
11735 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
11736 Intrinsic::aarch64_neon_ld3,
11737 Intrinsic::aarch64_neon_ld4};
11738 Function *LdNFunc =
11739 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
11741 // Holds sub-vectors extracted from the load intrinsic return values. The
11742   // sub-vectors are associated with the shufflevector instructions they will
11743   // replace.
11744 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
11746 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
11748 // If we're generating more than one load, compute the base address of
11749     // subsequent loads as an offset from the previous.
11750     if (LoadCount > 0)
11751       BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
11752 FVTy->getNumElements() * Factor);
11754 CallInst *LdN = Builder.CreateCall(
11755 LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
11757 // Extract and store the sub-vectors returned by the load intrinsic.
11758 for (unsigned i = 0; i < Shuffles.size(); i++) {
11759 ShuffleVectorInst *SVI = Shuffles[i];
11760 unsigned Index = Indices[i];
11762 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
11764 // Convert the integer vector to pointer vector if the element is pointer.
11765 if (EltTy->isPointerTy())
11766 SubVec = Builder.CreateIntToPtr(
11767 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
11768 FVTy->getNumElements()));
11769 SubVecs[SVI].push_back(SubVec);
11773 // Replace uses of the shufflevector instructions with the sub-vectors
11774 // returned by the load intrinsic. If a shufflevector instruction is
11775 // associated with more than one sub-vector, those sub-vectors will be
11776 // concatenated into a single wide vector.
11777 for (ShuffleVectorInst *SVI : Shuffles) {
11778 auto &SubVec = SubVecs[SVI];
11779     Value *WideVec =
11780         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
11781     SVI->replaceAllUsesWith(WideVec);
11782   }
11784   return true;
11785 }
11787 /// Lower an interleaved store into a stN intrinsic.
11789 /// E.g. Lower an interleaved store (Factor = 3):
11790 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
11791 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
11792 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
11793 ///
11794 ///      Into:
11795 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
11796 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
11797 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
11798 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11800 /// Note that the new shufflevectors will be removed and we'll only generate one
11801 /// st3 instruction in CodeGen.
11803 /// Example for a more general valid mask (Factor 3). Lower:
11804 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
11805 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
11806 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
11807 ///
11808 ///      Into:
11809 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
11810 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
11811 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
11812 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11813 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
11814 ShuffleVectorInst *SVI,
11815 unsigned Factor) const {
11816 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11817 "Invalid interleave factor");
11819 auto *VecTy = cast<FixedVectorType>(SVI->getType());
11820 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
11822 unsigned LaneLen = VecTy->getNumElements() / Factor;
11823 Type *EltTy = VecTy->getElementType();
11824 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
11826 const DataLayout &DL = SI->getModule()->getDataLayout();
11828 // Skip if we do not have NEON and skip illegal vector types. We can
11829 // "legalize" wide vector types into multiple interleaved accesses as long as
11830 // the vector types are divisible by 128.
11831 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
11834 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
11836 Value *Op0 = SVI->getOperand(0);
11837 Value *Op1 = SVI->getOperand(1);
11838 IRBuilder<> Builder(SI);
11840 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
11841 // vectors to integer vectors.
11842 if (EltTy->isPointerTy()) {
11843 Type *IntTy = DL.getIntPtrType(EltTy);
11844 unsigned NumOpElts =
11845 cast<FixedVectorType>(Op0->getType())->getNumElements();
11847 // Convert to the corresponding integer vector.
11848 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
11849 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
11850 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
11852 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
11855 // The base address of the store.
11856 Value *BaseAddr = SI->getPointerOperand();
11858 if (NumStores > 1) {
11859 // If we're going to generate more than one store, reset the lane length
11860 // and sub-vector type to something legal.
11861 LaneLen /= NumStores;
11862 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
11864 // We will compute the pointer operand of each store from the original base
11865     // address using GEPs. Cast the base address to a pointer to the scalar
11866     // element type.
11867     BaseAddr = Builder.CreateBitCast(
11868         BaseAddr,
11869         SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
11870   }
11872 auto Mask = SVI->getShuffleMask();
11874 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
11875 Type *Tys[2] = {SubVecTy, PtrTy};
11876 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
11877 Intrinsic::aarch64_neon_st3,
11878 Intrinsic::aarch64_neon_st4};
11879 Function *StNFunc =
11880 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
11882 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
11884 SmallVector<Value *, 5> Ops;
11886 // Split the shufflevector operands into sub vectors for the new stN call.
11887 for (unsigned i = 0; i < Factor; i++) {
11888 unsigned IdxI = StoreCount * LaneLen * Factor + i;
11889 if (Mask[IdxI] >= 0) {
11890 Ops.push_back(Builder.CreateShuffleVector(
11891 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
11892       } else {
11893         unsigned StartMask = 0;
11894 for (unsigned j = 1; j < LaneLen; j++) {
11895 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
11896 if (Mask[IdxJ * Factor + IdxI] >= 0) {
11897             StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
11898             break;
11899           }
11900         }
11901 // Note: Filling undef gaps with random elements is ok, since
11902 // those elements were being written anyway (with undefs).
11903 // In the case of all undefs we're defaulting to using elems from 0
11904 // Note: StartMask cannot be negative, it's checked in
11905 // isReInterleaveMask
11906 Ops.push_back(Builder.CreateShuffleVector(
11907 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
11911     // If we're generating more than one store, we compute the base address of
11912 // subsequent stores as an offset from the previous.
11913 if (StoreCount > 0)
11914 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
11915 BaseAddr, LaneLen * Factor);
11917 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
11918 Builder.CreateCall(StNFunc, Ops);
11923 // Lower an SVE structured load intrinsic returning a tuple type to target
11924 // specific intrinsic taking the same input but returning a multi-result value
11925 // of the split tuple type.
11927 // E.g. Lowering an LD3:
11929 // call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
11930 // <vscale x 4 x i1> %pred,
11931 // <vscale x 4 x i32>* %addr)
11935 // t0: ch = EntryToken
11936 // t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
11937 // t4: i64,ch = CopyFromReg t0, Register:i64 %1
11938 // t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
11939 // t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
11941 // This is called pre-legalization to avoid widening/splitting issues with
11942 // non-power-of-2 tuple types used for LD3, such as nxv12i32.
11943 SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
11944 ArrayRef<SDValue> LoadOps,
11945 EVT VT, SelectionDAG &DAG,
11946 const SDLoc &DL) const {
11947 assert(VT.isScalableVector() && "Can only lower scalable vectors");
11949 unsigned N, Opcode;
11950 static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
11951 {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
11952 {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
11953 {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
11955 std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
11956 assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
11957 "invalid tuple vector type!");
11959   EVT SplitVT =
11960       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
11961 VT.getVectorElementCount().divideCoefficientBy(N));
11962 assert(isTypeLegal(SplitVT));
11964 SmallVector<EVT, 5> VTs(N, SplitVT);
11965 VTs.push_back(MVT::Other); // Chain
11966 SDVTList NodeTys = DAG.getVTList(VTs);
11968 SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
11969 SmallVector<SDValue, 4> PseudoLoadOps;
11970 for (unsigned I = 0; I < N; ++I)
11971 PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
11972 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
11975 EVT AArch64TargetLowering::getOptimalMemOpType(
11976 const MemOp &Op, const AttributeList &FuncAttributes) const {
11977 bool CanImplicitFloat =
11978 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
11979 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
11980 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
11981 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
11982 // taken one instruction to materialize the v2i64 zero and one store (with
11983 // restrictive addressing mode). Just do i64 stores.
11984 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
11985 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
11986     if (Op.isAligned(AlignCheck))
11987       return true;
11988     bool Fast;
11989     return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
11990                                           MachineMemOperand::MONone, &Fast) &&
11991            Fast;
11992   };
11994   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
11995       AlignmentIsAcceptable(MVT::v2i64, Align(16)))
11996     return MVT::v2i64;
11997   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
11998     return MVT::f128;
11999   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
12000     return MVT::i64;
12001   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
12002     return MVT::i32;
12003   return MVT::Other;
12004 }
12006 LLT AArch64TargetLowering::getOptimalMemOpLLT(
12007 const MemOp &Op, const AttributeList &FuncAttributes) const {
12008 bool CanImplicitFloat =
12009 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
12010 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
12011 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
12012 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
12013 // taken one instruction to materialize the v2i64 zero and one store (with
12014 // restrictive addressing mode). Just do i64 stores.
12015 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
12016 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
12017     if (Op.isAligned(AlignCheck))
12018       return true;
12019     bool Fast;
12020     return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
12021                                           MachineMemOperand::MONone, &Fast) &&
12022            Fast;
12023   };
12025 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
12026 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
12027 return LLT::fixed_vector(2, 64);
12028 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
12029 return LLT::scalar(128);
12030 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
12031 return LLT::scalar(64);
12032 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
12033     return LLT::scalar(32);
12034   return LLT();
12035 }
12037 // 12-bit optionally shifted immediates are legal for adds.
12038 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
12039 if (Immed == std::numeric_limits<int64_t>::min()) {
12040 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
12041 << ": avoid UB for INT64_MIN\n");
12044 // Same encoding for add/sub, just flip the sign.
12045 Immed = std::abs(Immed);
12046 bool IsLegal = ((Immed >> 12) == 0 ||
12047 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
12048 LLVM_DEBUG(dbgs() << "Is " << Immed
12049 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
12053 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
12054 // immediates is the same as for an add or a sub.
12055 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
12056 return isLegalAddImmediate(Immed);
12059 /// isLegalAddressingMode - Return true if the addressing mode represented
12060 /// by AM is legal for this target, for a load/store of the specified type.
12061 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
12062 const AddrMode &AM, Type *Ty,
12063 unsigned AS, Instruction *I) const {
12064   // AArch64 has five basic addressing modes:
12065   //  reg
12066   //  reg + 9-bit signed offset
12067   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
12068   //  reg + reg
12069   //  reg + SIZE_IN_BYTES * reg
12071   // No global is ever allowed as a base.
12072   if (AM.BaseGV)
12073     return false;
12075 // No reg+reg+imm addressing.
12076 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
12079 // FIXME: Update this method to support scalable addressing modes.
12080 if (isa<ScalableVectorType>(Ty)) {
12081 uint64_t VecElemNumBytes =
12082 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
12083 return AM.HasBaseReg && !AM.BaseOffs &&
12084 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
12087 // check reg + imm case:
12088 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
12089 uint64_t NumBytes = 0;
12090 if (Ty->isSized()) {
12091 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
12092 NumBytes = NumBits / 8;
12093     if (!isPowerOf2_64(NumBits))
12094       NumBytes = 0;
12095   }
12097   if (!AM.Scale) {
12098     int64_t Offset = AM.BaseOffs;
12100 // 9-bit signed offset
12101 if (isInt<9>(Offset))
12104 // 12-bit unsigned offset
12105 unsigned shift = Log2_64(NumBytes);
12106 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
12107 // Must be a multiple of NumBytes (NumBytes is a power of 2)
12108 (Offset >> shift) << shift == Offset)
12113 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
12115 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
12118 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
12119   // Consider splitting large offset of struct or array.
12120   return true;
12121 }
12123 InstructionCost AArch64TargetLowering::getScalingFactorCost(
12124 const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const {
12125 // Scaling factors are not free at all.
12126 // Operands | Rt Latency
12127 // -------------------------------------------
12128 // Rt, [Xn, Xm] | 4
12129 // -------------------------------------------
12130 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
12131 // Rt, [Xn, Wm, <extend> #imm] |
12132 if (isLegalAddressingMode(DL, AM, Ty, AS))
12133 // Scale represents reg2 * scale, thus account for 1 if
12134 // it is not equal to 0 or 1.
12135     return AM.Scale != 0 && AM.Scale != 1;
12136   return -1;
12137 }
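// So an unscaled [Xn, Xm] form is treated as free, while a scaled form such as
// [Xn, Xm, lsl #3] is charged one extra unit, matching the latency table above.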
12139 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
12140 const MachineFunction &MF, EVT VT) const {
12141 VT = VT.getScalarType();
12143   if (!VT.isSimple())
12144     return false;
12146   switch (VT.getSimpleVT().SimpleTy) {
12147   case MVT::f16:
12148     return Subtarget->hasFullFP16();
12149   case MVT::f32:
12150   case MVT::f64:
12151     return true;
12152   default:
12153     break;
12154   }
12156   return false;
12157 }
12159 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
12160                                                        Type *Ty) const {
12161   switch (Ty->getScalarType()->getTypeID()) {
12162 case Type::FloatTyID:
12163   case Type::DoubleTyID:
12164     return true;
12165   default:
12166     return false;
12167   }
12168 }
12170 bool AArch64TargetLowering::generateFMAsInMachineCombiner(
12171 EVT VT, CodeGenOpt::Level OptLevel) const {
12172   return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector();
12173 }
12175 const MCPhysReg *
12176 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
12177 // LR is a callee-save register, but we must treat it as clobbered by any call
12178 // site. Hence we include LR in the scratch registers, which are in turn added
12179 // as implicit-defs for stackmaps and patchpoints.
12180 static const MCPhysReg ScratchRegs[] = {
12181     AArch64::X16, AArch64::X17, AArch64::LR, 0
12182   };
12183   return ScratchRegs;
12184 }
12186 bool
12187 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
12188 CombineLevel Level) const {
12189 N = N->getOperand(0).getNode();
12190 EVT VT = N->getValueType(0);
12191 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
12192 // it with shift to let it be lowered to UBFX.
12193 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
12194 isa<ConstantSDNode>(N->getOperand(1))) {
12195 uint64_t TruncMask = N->getConstantOperandVal(1);
12196 if (isMask_64(TruncMask) &&
12197 N->getOperand(0).getOpcode() == ISD::SRL &&
12198         isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
12199       return false;
12200   }
12202   return true;
12203 }
12204 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
12205                                                               Type *Ty) const {
12206   assert(Ty->isIntegerTy());
12208   unsigned BitSize = Ty->getPrimitiveSizeInBits();
12209   if (BitSize == 0)
12210     return false;
12212 int64_t Val = Imm.getSExtValue();
12213   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
12214     return true;
12216   if ((int64_t)Val < 0)
12217     Val = ~Val;
12218   if (BitSize == 32)
12219     Val &= (1LL << 32) - 1;
12221 unsigned LZ = countLeadingZeros((uint64_t)Val);
12222 unsigned Shift = (63 - LZ) / 16;
12223   // MOVZ is free so return true for one or fewer MOVK.
12224   return Shift < 3;
12225 }
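// For example, 0x12345678 (MOVZ plus one MOVK) is materialized inline, whereas
// a full 64-bit pattern such as 0x123456789abcdef0 (MOVZ plus three MOVKs) is
// left as a constant-pool load.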
12227 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
12228 unsigned Index) const {
12229   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
12230     return false;
12232   return (Index == 0 || Index == ResVT.getVectorNumElements());
12233 }
12235 /// Turn vector tests of the signbit in the form of:
12236 ///   xor (sra X, elt_size(X)-1), -1
12237 /// into:
12238 ///   cmge X, X, #0
12239 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
12240 const AArch64Subtarget *Subtarget) {
12241 EVT VT = N->getValueType(0);
12242 if (!Subtarget->hasNEON() || !VT.isVector())
12245 // There must be a shift right algebraic before the xor, and the xor must be a
12246 // 'not' operation.
12247 SDValue Shift = N->getOperand(0);
12248 SDValue Ones = N->getOperand(1);
12249 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
12250 !ISD::isBuildVectorAllOnes(Ones.getNode()))
12253 // The shift should be smearing the sign bit across each vector element.
12254 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
12255 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
12256 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
12259 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
12262 // Given a vecreduce_add node, detect the below pattern and convert it to the
12263 // node sequence with UABDL, [S|U]ABD and UADDLP.
12265 // i32 vecreduce_add(
12268 // v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
12269 // =================>
12270 // i32 vecreduce_add(
12274 // v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
12276 // v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
12277 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
12278 SelectionDAG &DAG) {
12279 // Assumed i32 vecreduce_add
12280 if (N->getValueType(0) != MVT::i32)
12283 SDValue VecReduceOp0 = N->getOperand(0);
12284 unsigned Opcode = VecReduceOp0.getOpcode();
12285 // Assumed v16i32 abs
12286 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
12289 SDValue ABS = VecReduceOp0;
12290 // Assumed v16i32 sub
12291 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
12292 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
12295 SDValue SUB = ABS->getOperand(0);
12296 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
12297 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
12298 // Assumed v16i32 type
12299 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
12300 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
12303 // Assumed zext or sext
12304 bool IsZExt = false;
12305 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
12307 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
12312 SDValue EXT0 = SUB->getOperand(0);
12313 SDValue EXT1 = SUB->getOperand(1);
12314 // Assumed zext's operand has v16i8 type
12315 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
12316 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
12319   // Pattern is detected. Let's convert it to sequence of nodes.
12322 // First, create the node pattern of UABD/SABD.
12323 SDValue UABDHigh8Op0 =
12324 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
12325 DAG.getConstant(8, DL, MVT::i64));
12326 SDValue UABDHigh8Op1 =
12327 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
12328 DAG.getConstant(8, DL, MVT::i64));
12329 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
12330 UABDHigh8Op0, UABDHigh8Op1);
12331 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
12333 // Second, create the node pattern of UABAL.
12334 SDValue UABDLo8Op0 =
12335 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
12336 DAG.getConstant(0, DL, MVT::i64));
12337 SDValue UABDLo8Op1 =
12338 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
12339 DAG.getConstant(0, DL, MVT::i64));
12340 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
12341 UABDLo8Op0, UABDLo8Op1);
12342 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
12343 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
12345 // Third, create the node of UADDLP.
12346 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
12348 // Fourth, create the node of VECREDUCE_ADD.
12349 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
12352 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
12353 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
12354 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
12355 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
12356 const AArch64Subtarget *ST) {
12357 if (!ST->hasDotProd())
12358 return performVecReduceAddCombineWithUADDLP(N, DAG);
12360 SDValue Op0 = N->getOperand(0);
12361 if (N->getValueType(0) != MVT::i32 ||
12362 Op0.getValueType().getVectorElementType() != MVT::i32)
12365 unsigned ExtOpcode = Op0.getOpcode();
12368 if (ExtOpcode == ISD::MUL) {
12369 A = Op0.getOperand(0);
12370 B = Op0.getOperand(1);
12371 if (A.getOpcode() != B.getOpcode() ||
12372 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
12374 ExtOpcode = A.getOpcode();
12376 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
12379 EVT Op0VT = A.getOperand(0).getValueType();
12380   if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
12381     return SDValue();
12383   SDLoc DL(Op0);
12384   // For non-mla reductions B can be set to 1. For MLA we take the operand of
12385   // the extend B.
12386   if (!B)
12387     B = DAG.getConstant(1, DL, Op0VT);
12388   else
12389     B = B.getOperand(0);
12391   SDValue Zeros =
12392       DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
12393   auto DotOpcode =
12394       (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
12395 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
12396 A.getOperand(0), B);
12397 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
12400 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
12401 TargetLowering::DAGCombinerInfo &DCI,
12402 const AArch64Subtarget *Subtarget) {
12403 if (DCI.isBeforeLegalizeOps())
12406 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
12410 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
12412 SmallVectorImpl<SDNode *> &Created) const {
12413 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
12414 if (isIntDivCheap(N->getValueType(0), Attr))
12415 return SDValue(N,0); // Lower SDIV as SDIV
12417 // fold (sdiv X, pow2)
12418 EVT VT = N->getValueType(0);
12419 if ((VT != MVT::i32 && VT != MVT::i64) ||
12420 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
12424 SDValue N0 = N->getOperand(0);
12425 unsigned Lg2 = Divisor.countTrailingZeros();
12426 SDValue Zero = DAG.getConstant(0, DL, VT);
12427 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
12429 // Add (N0 < 0) ? Pow2 - 1 : 0;
12431 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
12432 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
12433 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
12435 Created.push_back(Cmp.getNode());
12436 Created.push_back(Add.getNode());
12437 Created.push_back(CSel.getNode());
12441 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
12443 // If we're dividing by a positive value, we're done. Otherwise, we must
12444 // negate the result.
12445 if (Divisor.isNonNegative())
12448 Created.push_back(SRA.getNode());
12449   return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
12450 }
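// For example, (sdiv x, 8) becomes roughly:
//   cmp  x0, #0
//   add  x8, x0, #7
//   csel x8, x8, x0, lt
//   asr  x0, x8, #3
// with an extra negation when the divisor is negative.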
12452 static bool IsSVECntIntrinsic(SDValue S) {
12453 switch(getIntrinsicID(S.getNode())) {
12456 case Intrinsic::aarch64_sve_cntb:
12457 case Intrinsic::aarch64_sve_cnth:
12458 case Intrinsic::aarch64_sve_cntw:
12459 case Intrinsic::aarch64_sve_cntd:
12465 /// Calculates what the pre-extend type is, based on the extension
12466 /// operation node provided by \p Extend.
12468 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
12469 /// pre-extend type is pulled directly from the operand, while other extend
12470 /// operations need a bit more inspection to get this information.
12472 /// \param Extend The SDNode from the DAG that represents the extend operation
12473 /// \param DAG The SelectionDAG hosting the \p Extend node
12475 /// \returns The type representing the \p Extend source type, or \p MVT::Other
12476 /// if no valid type can be determined
12477 static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) {
12478 switch (Extend.getOpcode()) {
12479 case ISD::SIGN_EXTEND:
12480 case ISD::ZERO_EXTEND:
12481 return Extend.getOperand(0).getValueType();
12482 case ISD::AssertSext:
12483 case ISD::AssertZext:
12484 case ISD::SIGN_EXTEND_INREG: {
12485 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
12488 return TypeNode->getVT();
12491 ConstantSDNode *Constant =
12492 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
12496 uint32_t Mask = Constant->getZExtValue();
12498 if (Mask == UCHAR_MAX)
12500 else if (Mask == USHRT_MAX)
12502 else if (Mask == UINT_MAX)
12511 llvm_unreachable("Code path unhandled in calculatePreExtendType!");
12514 /// Combines a dup(sext/zext) node pattern into sext/zext(dup)
12515 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
12516 static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle,
12517 SelectionDAG &DAG) {
12519 ShuffleVectorSDNode *ShuffleNode =
12520 dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode());
12524 // Ensuring the mask is zero before continuing
12525 if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0)
12528 SDValue InsertVectorElt = VectorShuffle.getOperand(0);
12530 if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
12533 SDValue InsertLane = InsertVectorElt.getOperand(2);
12534 ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode());
12535 // Ensures the insert is inserting into lane 0
12536 if (!Constant || Constant->getZExtValue() != 0)
12539 SDValue Extend = InsertVectorElt.getOperand(1);
12540 unsigned ExtendOpcode = Extend.getOpcode();
12542 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
12543 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
12544 ExtendOpcode == ISD::AssertSext;
12545 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
12546 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
12549 EVT TargetType = VectorShuffle.getValueType();
12550 EVT PreExtendType = calculatePreExtendType(Extend, DAG);
12552 if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 &&
12553 TargetType != MVT::v2i64) ||
12554 (PreExtendType == MVT::Other))
12557 // Restrict valid pre-extend data type
12558 if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 &&
12559 PreExtendType != MVT::i32)
12562 EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType);
12564 if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount())
12567 if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2)
12570 SDLoc DL(VectorShuffle);
12572 SDValue InsertVectorNode = DAG.getNode(
12573 InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT),
12574 DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType),
12575 DAG.getConstant(0, DL, MVT::i64));
12577 std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue());
12579 SDValue VectorShuffleNode =
12580 DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode,
12581 DAG.getUNDEF(PreExtendVT), ShuffleMask);
12583 SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
12584 DL, TargetType, VectorShuffleNode);
12589 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
12590 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
12591 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
12592 // If the value type isn't a vector, none of the operands are going to be dups
12593 if (!Mul->getValueType(0).isVector())
12596 SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG);
12597 SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG);
12599 // Neither operands have been changed, don't make any further changes
12604 return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0),
12605 Op0 ? Op0 : Mul->getOperand(0),
12606 Op1 ? Op1 : Mul->getOperand(1));
12609 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
12610 TargetLowering::DAGCombinerInfo &DCI,
12611 const AArch64Subtarget *Subtarget) {
12613 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
12616 if (DCI.isBeforeLegalizeOps())
12619 // The below optimizations require a constant RHS.
12620 if (!isa<ConstantSDNode>(N->getOperand(1)))
12623 SDValue N0 = N->getOperand(0);
12624 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
12625 const APInt &ConstValue = C->getAPIntValue();
12627 // Allow the scaling to be folded into the `cnt` instruction by preventing
12628 // the scaling to be obscured here. This makes it easier to pattern match.
12629 if (IsSVECntIntrinsic(N0) ||
12630 (N0->getOpcode() == ISD::TRUNCATE &&
12631 (IsSVECntIntrinsic(N0->getOperand(0)))))
12632 if (ConstValue.sge(1) && ConstValue.sle(16))
12635 // Multiplication of a power of two plus/minus one can be done more
12636 // cheaply as a shift+add/sub. For now, this is true unilaterally. If
12637 // future CPUs have a cheaper MADD instruction, this may need to be
12638 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
12639 // 64-bit is 5 cycles, so this is always a win.
12640 // More aggressively, some multiplications N0 * C can be lowered to
12641 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
12642 // e.g. 6=3*2=(2+1)*2.
12643 // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
12644 // which equals to (1+2)*16-(1+2).
12646 // TrailingZeroes is used to test if the mul can be lowered to
12647 // shift+add+shift.
12648 unsigned TrailingZeroes = ConstValue.countTrailingZeros();
12649 if (TrailingZeroes) {
12650 // Conservatively do not lower to shift+add+shift if the mul might be
12651 // folded into smul or umul.
12652 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
12653 isZeroExtended(N0.getNode(), DAG)))
12655 // Conservatively do not lower to shift+add+shift if the mul might be
12656 // folded into madd or msub.
12657 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
12658 N->use_begin()->getOpcode() == ISD::SUB))
12661 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
12662 // and shift+add+shift.
12663 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
12665 unsigned ShiftAmt, AddSubOpc;
12666 // Is the shifted value the LHS operand of the add/sub?
12667 bool ShiftValUseIsN0 = true;
12668 // Do we need to negate the result?
12669 bool NegateResult = false;
12671 if (ConstValue.isNonNegative()) {
12672 // (mul x, 2^N + 1) => (add (shl x, N), x)
12673 // (mul x, 2^N - 1) => (sub (shl x, N), x)
12674 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
12675 APInt SCVMinus1 = ShiftedConstValue - 1;
12676 APInt CVPlus1 = ConstValue + 1;
12677 if (SCVMinus1.isPowerOf2()) {
12678 ShiftAmt = SCVMinus1.logBase2();
12679 AddSubOpc = ISD::ADD;
12680 } else if (CVPlus1.isPowerOf2()) {
12681 ShiftAmt = CVPlus1.logBase2();
12682 AddSubOpc = ISD::SUB;
12686 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
12687 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
12688 APInt CVNegPlus1 = -ConstValue + 1;
12689 APInt CVNegMinus1 = -ConstValue - 1;
12690 if (CVNegPlus1.isPowerOf2()) {
12691 ShiftAmt = CVNegPlus1.logBase2();
12692 AddSubOpc = ISD::SUB;
12693 ShiftValUseIsN0 = false;
12694 } else if (CVNegMinus1.isPowerOf2()) {
12695 ShiftAmt = CVNegMinus1.logBase2();
12696 AddSubOpc = ISD::ADD;
12697 NegateResult = true;
12703 EVT VT = N->getValueType(0);
12704 SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
12705 DAG.getConstant(ShiftAmt, DL, MVT::i64));
12707 SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
12708 SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
12709 SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
12710 assert(!(NegateResult && TrailingZeroes) &&
12711 "NegateResult and TrailingZeroes cannot both be true for now.");
12712 // Negate the result.
12714 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
12715 // Shift the result.
12716 if (TrailingZeroes)
12717 return DAG.getNode(ISD::SHL, DL, VT, Res,
12718 DAG.getConstant(TrailingZeroes, DL, MVT::i64));
12722 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
12723 SelectionDAG &DAG) {
12724 // Take advantage of vector comparisons producing 0 or -1 in each lane to
12725 // optimize away operation when it's from a constant.
12727 // The general transformation is:
12728 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
12729 // AND(VECTOR_CMP(x,y), constant2)
12730 // constant2 = UNARYOP(constant)
12732 // Early exit if this isn't a vector operation, the operand of the
12733 // unary operation isn't a bitwise AND, or if the sizes of the operations
12734 // aren't the same.
12735 EVT VT = N->getValueType(0);
12736 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
12737 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
12738 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
12741 // Now check that the other operand of the AND is a constant. We could
12742 // make the transformation for non-constant splats as well, but it's unclear
12743 // that would be a benefit as it would not eliminate any operations, just
12744 // perform one more step in scalar code before moving to the vector unit.
12745 if (BuildVectorSDNode *BV =
12746 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
12747 // Bail out if the vector isn't a constant.
12748 if (!BV->isConstant())
12751 // Everything checks out. Build up the new and improved node.
12753 EVT IntVT = BV->getValueType(0);
12754 // Create a new constant of the appropriate type for the transformed
12756 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
12757 // The AND node needs bitcasts to/from an integer vector type around it.
12758 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
12759 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
12760 N->getOperand(0)->getOperand(0), MaskConst);
12761 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
12768 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
12769 const AArch64Subtarget *Subtarget) {
12770 // First try to optimize away the conversion when it's conditionally from
12771 // a constant. Vectors only.
12772 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
12775 EVT VT = N->getValueType(0);
12776 if (VT != MVT::f32 && VT != MVT::f64)
12779 // Only optimize when the source and destination types have the same width.
12780 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
12783 // If the result of an integer load is only used by an integer-to-float
12784 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
12785 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
12786 SDValue N0 = N->getOperand(0);
12787 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
12788 // Do not change the width of a volatile load.
12789 !cast<LoadSDNode>(N0)->isVolatile()) {
12790 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12791 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
12792 LN0->getPointerInfo(), LN0->getAlignment(),
12793 LN0->getMemOperand()->getFlags());
12795 // Make sure successors of the original load stay after it by updating them
12796 // to use the new Chain.
12797 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
12800 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
12801 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
12807 /// Fold a floating-point multiply by power of two into floating-point to
12808 /// fixed-point conversion.
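/// For example, (v2i32 (fp_to_sint (fmul v2f32:x, <4.0, 4.0>))) can become a
/// fixed-point convert with two fractional bits, i.e. fcvtzs v.2s, v.2s, #2.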
12809 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
12810 TargetLowering::DAGCombinerInfo &DCI,
12811 const AArch64Subtarget *Subtarget) {
12812 if (!Subtarget->hasNEON())
12815 if (!N->getValueType(0).isSimple())
12818 SDValue Op = N->getOperand(0);
12819 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12820 Op.getOpcode() != ISD::FMUL)
12823 SDValue ConstVec = Op->getOperand(1);
12824 if (!isa<BuildVectorSDNode>(ConstVec))
12827 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
12828 uint32_t FloatBits = FloatTy.getSizeInBits();
12829 if (FloatBits != 32 && FloatBits != 64)
12832 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
12833 uint32_t IntBits = IntTy.getSizeInBits();
12834 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12837 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
12838 if (IntBits > FloatBits)
12841 BitVector UndefElements;
12842 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
12843 int32_t Bits = IntBits == 64 ? 64 : 32;
12844 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
12845 if (C == -1 || C == 0 || C > Bits)
12849 unsigned NumLanes = Op.getValueType().getVectorNumElements();
12850 switch (NumLanes) {
12854 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
12857 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
12861 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12864 assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
12865 "Illegal vector type after legalization");
12868 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
12869 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
12870 : Intrinsic::aarch64_neon_vcvtfp2fxu;
12872 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
12873 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
12874 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
12875 // We can handle smaller integers by generating an extra trunc.
12876 if (IntBits < FloatBits)
12877 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
12882 /// Fold a floating-point divide by power of two into fixed-point to
12883 /// floating-point conversion.
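/// For example, (v2f32 (fdiv (sint_to_fp v2i32:x), <16.0, 16.0>)) can become a
/// fixed-point convert with four fractional bits, i.e. scvtf v.2s, v.2s, #4.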
12884 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
12885 TargetLowering::DAGCombinerInfo &DCI,
12886 const AArch64Subtarget *Subtarget) {
12887 if (!Subtarget->hasNEON())
12890 SDValue Op = N->getOperand(0);
12891 unsigned Opc = Op->getOpcode();
12892 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12893 !Op.getOperand(0).getValueType().isSimple() ||
12894 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
12897 SDValue ConstVec = N->getOperand(1);
12898 if (!isa<BuildVectorSDNode>(ConstVec))
12901 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
12902 int32_t IntBits = IntTy.getSizeInBits();
12903 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12906 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
12907 int32_t FloatBits = FloatTy.getSizeInBits();
12908 if (FloatBits != 32 && FloatBits != 64)
12911 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
12912 if (IntBits > FloatBits)
12915 BitVector UndefElements;
12916 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
12917 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
12918 if (C == -1 || C == 0 || C > FloatBits)
12922 unsigned NumLanes = Op.getValueType().getVectorNumElements();
12923 switch (NumLanes) {
12927 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
12930 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
12934 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12938 SDValue ConvInput = Op.getOperand(0);
12939 bool IsSigned = Opc == ISD::SINT_TO_FP;
12940 if (IntBits < FloatBits)
12941 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
12944 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
12945 : Intrinsic::aarch64_neon_vcvtfxu2fp;
12946 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
12947 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
12948 DAG.getConstant(C, DL, MVT::i32));
12951 /// An EXTR instruction is made up of two shifts, ORed together. This helper
12952 /// searches for and classifies those shifts.
12953 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
12955 if (N.getOpcode() == ISD::SHL)
12957 else if (N.getOpcode() == ISD::SRL)
12962 if (!isa<ConstantSDNode>(N.getOperand(1)))
12965 ShiftAmount = N->getConstantOperandVal(1);
12966 Src = N->getOperand(0);
12970 /// EXTR instruction extracts a contiguous chunk of bits from two existing
12971 /// registers viewed as a high/low pair. This function looks for the pattern:
12972 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
12973 /// with an EXTR. Can't quite be done in TableGen because the two immediates
12974 /// aren't independent.
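/// For example, on i64 the node (or (shl x, #16), (srl y, #48)) becomes
/// (EXTR x, y, #48), extracting a contiguous 64-bit window from the x:y pair.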
12975 static SDValue tryCombineToEXTR(SDNode *N,
12976 TargetLowering::DAGCombinerInfo &DCI) {
12977 SelectionDAG &DAG = DCI.DAG;
12979 EVT VT = N->getValueType(0);
12981 assert(N->getOpcode() == ISD::OR && "Unexpected root");
12983 if (VT != MVT::i32 && VT != MVT::i64)
12987 uint32_t ShiftLHS = 0;
12988 bool LHSFromHi = false;
12989 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
12993 uint32_t ShiftRHS = 0;
12994 bool RHSFromHi = false;
12995 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
12998 // If they're both trying to come from the high part of the register, they're
12999 // not really an EXTR.
13000 if (LHSFromHi == RHSFromHi)
13003 if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
13007 std::swap(LHS, RHS);
13008 std::swap(ShiftLHS, ShiftRHS);
13011 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
13012 DAG.getConstant(ShiftRHS, DL, MVT::i64));
13015 static SDValue tryCombineToBSL(SDNode *N,
13016 TargetLowering::DAGCombinerInfo &DCI) {
13017 EVT VT = N->getValueType(0);
13018 SelectionDAG &DAG = DCI.DAG;
13021 if (!VT.isVector())
13024 // The combining code currently only works for NEON vectors. In particular,
13025 // it does not work for SVE when dealing with vectors wider than 128 bits.
13026 if (!VT.is64BitVector() && !VT.is128BitVector())
13029 SDValue N0 = N->getOperand(0);
13030 if (N0.getOpcode() != ISD::AND)
13033 SDValue N1 = N->getOperand(1);
13034 if (N1.getOpcode() != ISD::AND)
13037 // InstCombine does (not (neg a)) => (add a -1).
13038 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
13039 // Loop over all combinations of AND operands.
13040 for (int i = 1; i >= 0; --i) {
13041 for (int j = 1; j >= 0; --j) {
13042 SDValue O0 = N0->getOperand(i);
13043 SDValue O1 = N1->getOperand(j);
13044 SDValue Sub, Add, SubSibling, AddSibling;
13046 // Find a SUB and an ADD operand, one from each AND.
13047 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
13050 SubSibling = N0->getOperand(1 - i);
13051 AddSibling = N1->getOperand(1 - j);
13052 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
13055 AddSibling = N0->getOperand(1 - i);
13056 SubSibling = N1->getOperand(1 - j);
13060 if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
13063 // The all-ones constant is always the right-hand operand of the Add.
13064 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
13067 if (Sub.getOperand(1) != Add.getOperand(0))
13070 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
13074 // (or (and a b) (and (not a) c)) => (bsl a b c)
13075 // We only have to look for constant vectors here since the general, variable
13076 // case can be handled in TableGen.
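// With constant vectors the "not" relationship shows up as two build_vectors
// whose lanes are bitwise complements, e.g. <0xf0, 0xf0, ...> paired with
// <0x0f, 0x0f, ...> for v8i8.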
13077 unsigned Bits = VT.getScalarSizeInBits();
13078 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
13079 for (int i = 1; i >= 0; --i)
13080 for (int j = 1; j >= 0; --j) {
13081 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
13082 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
13083 if (!BVN0 || !BVN1)
13086 bool FoundMatch = true;
13087 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
13088 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
13089 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
13090 if (!CN0 || !CN1 ||
13091 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
13092 FoundMatch = false;
13098 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
13099 N0->getOperand(1 - i), N1->getOperand(1 - j));
13105 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
13106 const AArch64Subtarget *Subtarget) {
13107 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
13108 SelectionDAG &DAG = DCI.DAG;
13109 EVT VT = N->getValueType(0);
13111 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
13114 if (SDValue Res = tryCombineToEXTR(N, DCI))
13117 if (SDValue Res = tryCombineToBSL(N, DCI))
13123 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
13124 if (!MemVT.getVectorElementType().isSimple())
13127 uint64_t MaskForTy = 0ull;
13128 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
13130 MaskForTy = 0xffull;
13133 MaskForTy = 0xffffull;
13136 MaskForTy = 0xffffffffull;
13143 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
13144 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
13145 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
13150 static SDValue performSVEAndCombine(SDNode *N,
13151 TargetLowering::DAGCombinerInfo &DCI) {
13152 if (DCI.isBeforeLegalizeOps())
13155 SelectionDAG &DAG = DCI.DAG;
13156 SDValue Src = N->getOperand(0);
13157 unsigned Opc = Src->getOpcode();
13159 // Zero/any extend of an unsigned unpack
13160 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
13161 SDValue UnpkOp = Src->getOperand(0);
13162 SDValue Dup = N->getOperand(1);
13164 if (Dup.getOpcode() != AArch64ISD::DUP)
13168 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
13169 uint64_t ExtVal = C->getZExtValue();
13171 // If the mask is fully covered by the unpack, we don't need to push
13172 // a new AND onto the operand
13173 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
13174 if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
13175 (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
13176 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
13179 // Truncate to prevent a DUP with an over-wide constant.
13180 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
13182 // Otherwise, make sure we propagate the AND to the operand
13184 Dup = DAG.getNode(AArch64ISD::DUP, DL,
13185 UnpkOp->getValueType(0),
13186 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
13188 SDValue And = DAG.getNode(ISD::AND, DL,
13189 UnpkOp->getValueType(0), UnpkOp, Dup);
13191 return DAG.getNode(Opc, DL, N->getValueType(0), And);
13194 if (!EnableCombineMGatherIntrinsics)
13197 SDValue Mask = N->getOperand(1);
13199 if (!Src.hasOneUse())
13204 // SVE load instructions perform an implicit zero-extend, which makes them
13205 // perfect candidates for combining.
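// For example, an AND of an i8-element load result with a splat of 0xff is
// redundant: the load has already zero-extended each element into its wider
// container lane, so we can return the load directly.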
13207 case AArch64ISD::LD1_MERGE_ZERO:
13208 case AArch64ISD::LDNF1_MERGE_ZERO:
13209 case AArch64ISD::LDFF1_MERGE_ZERO:
13210 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
13212 case AArch64ISD::GLD1_MERGE_ZERO:
13213 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
13214 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
13215 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
13216 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
13217 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
13218 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
13219 case AArch64ISD::GLDFF1_MERGE_ZERO:
13220 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
13221 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
13222 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
13223 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
13224 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
13225 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
13226 case AArch64ISD::GLDNT1_MERGE_ZERO:
13227 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
13233 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
13239 static SDValue performANDCombine(SDNode *N,
13240 TargetLowering::DAGCombinerInfo &DCI) {
13241 SelectionDAG &DAG = DCI.DAG;
13242 SDValue LHS = N->getOperand(0);
13243 EVT VT = N->getValueType(0);
13244 if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
13247 if (VT.isScalableVector())
13248 return performSVEAndCombine(N, DCI);
13250 // The combining code below works only for NEON vectors. In particular, it
13251 // does not work for SVE when dealing with vectors wider than 128 bits.
13252 if (!(VT.is64BitVector() || VT.is128BitVector()))
13255 BuildVectorSDNode *BVN =
13256 dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
13260 // AND does not accept an immediate, so check if we can use a BIC immediate
13261 // instruction instead. We do this here instead of using a (and x, (mvni imm))
13262 // pattern in isel, because some immediates may be lowered to the preferred
13263 // (and x, (movi imm)) form, even though an mvni representation also exists.
13264 APInt DefBits(VT.getSizeInBits(), 0);
13265 APInt UndefBits(VT.getSizeInBits(), 0);
13266 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13269 DefBits = ~DefBits;
13270 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
13272 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
13276 UndefBits = ~UndefBits;
13277 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
13278 UndefBits, &LHS)) ||
13279 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
13287 static SDValue performSRLCombine(SDNode *N,
13288 TargetLowering::DAGCombinerInfo &DCI) {
13289 SelectionDAG &DAG = DCI.DAG;
13290 EVT VT = N->getValueType(0);
13291 if (VT != MVT::i32 && VT != MVT::i64)
13294 // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
13295 // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
13296 // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
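// For example, when the top 16 bits of an i32 x are zero, (srl (bswap x), 16)
// and (rotr (bswap x), 16) compute the same value, and the ROTR form can then
// be matched by the REV16 selection patterns.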
13297 SDValue N0 = N->getOperand(0);
13298 if (N0.getOpcode() == ISD::BSWAP) {
13300 SDValue N1 = N->getOperand(1);
13301 SDValue N00 = N0.getOperand(0);
13302 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
13303 uint64_t ShiftAmt = C->getZExtValue();
13304 if (VT == MVT::i32 && ShiftAmt == 16 &&
13305 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
13306 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
13307 if (VT == MVT::i64 && ShiftAmt == 32 &&
13308 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
13309 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
13315 // Attempt to form urhadd(OpA, OpB) from
13316 // truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1))
13317 // or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)).
13318 // The original form of the first expression is
13319 // truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the
13320 // (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)).
13321 // Before this function is called the srl will have been lowered to
13322 // AArch64ISD::VLSHR.
13323 // This pass can also recognize signed variants of the patterns that use sign
13324 // extension instead of zero extension and form a srhadd(OpA, OpB) or a
13325 // shadd(OpA, OpB) from them.
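// For example, with v8i8 operands OpA and OpB the add form looks like
// (v8i8 (truncate (VLSHR (add (v8i16 (zext OpA)), (v8i16 (zext OpB))), 1)))
// and is replaced with (v8i8 (uhadd OpA, OpB)).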
13327 performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
13328 SelectionDAG &DAG) {
13329 EVT VT = N->getValueType(0);
13331 // Since we are looking for a right shift by a constant value of 1 and we are
13332 // operating on types at least 16 bits in length (sign/zero extended OpA and
13333 // OpB, which are at least 8 bits), it follows that the truncate will always
13334 // discard the shifted-in bit and therefore the right shift will be logical
13335 // regardless of the signedness of OpA and OpB.
13336 SDValue Shift = N->getOperand(0);
13337 if (Shift.getOpcode() != AArch64ISD::VLSHR)
13340 // Is the right shift using an immediate value of 1?
13341 uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
13342 if (ShiftAmount != 1)
13345 SDValue ExtendOpA, ExtendOpB;
13346 SDValue ShiftOp0 = Shift.getOperand(0);
13347 unsigned ShiftOp0Opc = ShiftOp0.getOpcode();
13348 if (ShiftOp0Opc == ISD::SUB) {
13350 SDValue Xor = ShiftOp0.getOperand(1);
13351 if (Xor.getOpcode() != ISD::XOR)
13354 // Is the XOR using an all-ones constant on the right-hand side?
13356 if (!isAllConstantBuildVector(Xor.getOperand(1), C))
13359 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13360 APInt CAsAPInt(ElemSizeInBits, C);
13361 if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
13364 ExtendOpA = Xor.getOperand(0);
13365 ExtendOpB = ShiftOp0.getOperand(0);
13366 } else if (ShiftOp0Opc == ISD::ADD) {
13367 ExtendOpA = ShiftOp0.getOperand(0);
13368 ExtendOpB = ShiftOp0.getOperand(1);
13372 unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
13373 unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
13374 if (!(ExtendOpAOpc == ExtendOpBOpc &&
13375 (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
13378 // Is the result of the right shift being truncated to the same value type as
13379 // the original operands, OpA and OpB?
13380 SDValue OpA = ExtendOpA.getOperand(0);
13381 SDValue OpB = ExtendOpB.getOperand(0);
13382 EVT OpAVT = OpA.getValueType();
13383 assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
13384 if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
13388 bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
13389 bool IsRHADD = ShiftOp0Opc == ISD::SUB;
13390 unsigned HADDOpc = IsSignExtend
13391 ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
13392 : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD);
13393 SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
13398 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
13401 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
13403 return VT == MVT::i64;
13409 static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
13410 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
13411 ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
13413 EVT VT = N->getValueType(0);
13414 const bool FullFP16 =
13415 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
13417 // Rewrite for pairwise fadd pattern
13418 // (f32 (extract_vector_elt
13419 // (fadd (vXf32 Other)
13420 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
13422 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
13423 // (extract_vector_elt (vXf32 Other) 1))
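// The resulting (fadd (extract 0), (extract 1)) form can then be selected as a
// scalar pairwise add (FADDP, or ADDP for the integer i64 case).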
13424 if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
13425 hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) {
13427 SDValue N00 = N0->getOperand(0);
13428 SDValue N01 = N0->getOperand(1);
13430 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
13431 SDValue Other = N00;
13433 // And handle the commutative case.
13435 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
13439 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
13440 Other == Shuffle->getOperand(0)) {
13441 return DAG.getNode(N0->getOpcode(), DL, VT,
13442 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
13443 DAG.getConstant(0, DL, MVT::i64)),
13444 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
13445 DAG.getConstant(1, DL, MVT::i64)));
13452 static SDValue performConcatVectorsCombine(SDNode *N,
13453 TargetLowering::DAGCombinerInfo &DCI,
13454 SelectionDAG &DAG) {
13456 EVT VT = N->getValueType(0);
13457 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
13458 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
13460 // Optimize concat_vectors of truncated vectors, where the intermediate
13461 // type is illegal, to avoid said illegality, e.g.,
13462 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
13463 // (v2i16 (truncate (v2i64)))))
13465 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
13466 // (v4i32 (bitcast (v2i64))),
13468 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
13469 // on both input and result type, so we might generate worse code.
13470 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
13471 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
13472 N1Opc == ISD::TRUNCATE) {
13473 SDValue N00 = N0->getOperand(0);
13474 SDValue N10 = N1->getOperand(0);
13475 EVT N00VT = N00.getValueType();
13477 if (N00VT == N10.getValueType() &&
13478 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
13479 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
13480 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
13481 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
13482 for (size_t i = 0; i < Mask.size(); ++i)
13484 return DAG.getNode(ISD::TRUNCATE, dl, VT,
13485 DAG.getVectorShuffle(
13487 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
13488 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
13492 // Wait 'til after everything is legalized to try this. That way we have
13493 // legal vector types and such.
13494 if (DCI.isBeforeLegalizeOps())
13497 // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted
13498 // subvectors from the same original vectors. Combine these into a single
13499 // [us]rhadd or [us]hadd that operates on the two original vectors. Example:
13500 // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
13501 // extract_subvector (v16i8 OpB,
13503 // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
13504 // extract_subvector (v16i8 OpB,
13507 // (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
13508 if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
13509 (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD ||
13510 N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) {
13511 SDValue N00 = N0->getOperand(0);
13512 SDValue N01 = N0->getOperand(1);
13513 SDValue N10 = N1->getOperand(0);
13514 SDValue N11 = N1->getOperand(1);
13516 EVT N00VT = N00.getValueType();
13517 EVT N10VT = N10.getValueType();
13519 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13520 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13521 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13522 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
13523 SDValue N00Source = N00->getOperand(0);
13524 SDValue N01Source = N01->getOperand(0);
13525 SDValue N10Source = N10->getOperand(0);
13526 SDValue N11Source = N11->getOperand(0);
13528 if (N00Source == N10Source && N01Source == N11Source &&
13529 N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
13530 assert(N0.getValueType() == N1.getValueType());
13532 uint64_t N00Index = N00.getConstantOperandVal(1);
13533 uint64_t N01Index = N01.getConstantOperandVal(1);
13534 uint64_t N10Index = N10.getConstantOperandVal(1);
13535 uint64_t N11Index = N11.getConstantOperandVal(1);
13537 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
13538 N10Index == N00VT.getVectorNumElements())
13539 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
13544 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
13545 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
13546 // canonicalise to that.
13547 if (N0 == N1 && VT.getVectorNumElements() == 2) {
13548 assert(VT.getScalarSizeInBits() == 64);
13549 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
13550 DAG.getConstant(0, dl, MVT::i64));
13553 // Canonicalise concat_vectors so that the right-hand vector has as few
13554 // bit-casts as possible before its real operation. The primary matching
13555 // destination for these operations will be the narrowing "2" instructions,
13556 // which depend on the operation being performed on this right-hand vector.
13558 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
13560 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
13562 if (N1Opc != ISD::BITCAST)
13564 SDValue RHS = N1->getOperand(0);
13565 MVT RHSTy = RHS.getValueType().getSimpleVT();
13566 // If the RHS is not a vector, this is not the pattern we're looking for.
13567 if (!RHSTy.isVector())
13571 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
13573 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
13574 RHSTy.getVectorNumElements() * 2);
13575 return DAG.getNode(ISD::BITCAST, dl, VT,
13576 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
13577 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
13581 static SDValue tryCombineFixedPointConvert(SDNode *N,
13582 TargetLowering::DAGCombinerInfo &DCI,
13583 SelectionDAG &DAG) {
13584 // Wait until after everything is legalized to try this. That way we have
13585 // legal vector types and such.
13586 if (DCI.isBeforeLegalizeOps())
13588 // Transform a scalar conversion of a value from a lane extract into a
13589 // lane extract of a vector conversion. E.g., from foo1 to foo2:
13590 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
13591 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
13593 // The second form interacts better with instruction selection and the
13594 // register allocator to avoid cross-class register copies that aren't
13595 // coalescable due to a lane reference.
13597 // Check the operand and see if it originates from a lane extract.
13598 SDValue Op1 = N->getOperand(1);
13599 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
13600 // Yep, no additional predication needed. Perform the transform.
13601 SDValue IID = N->getOperand(0);
13602 SDValue Shift = N->getOperand(2);
13603 SDValue Vec = Op1.getOperand(0);
13604 SDValue Lane = Op1.getOperand(1);
13605 EVT ResTy = N->getValueType(0);
13609 // The vector width should be 128 bits by the time we get here, even
13610 // if it started as 64 bits (the extract_vector handling will have
13612 assert(Vec.getValueSizeInBits() == 128 &&
13613 "unexpected vector size on extract_vector_elt!");
13614 if (Vec.getValueType() == MVT::v4i32)
13615 VecResTy = MVT::v4f32;
13616 else if (Vec.getValueType() == MVT::v2i64)
13617 VecResTy = MVT::v2f64;
13619 llvm_unreachable("unexpected vector type!");
13622 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
13623 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
13628 // AArch64 high-vector "long" operations are formed by performing the non-high
13629 // version on an extract_subvector of each operand which gets the high half:
13631 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
13633 // However, there are cases which don't have an extract_high explicitly, but
13634 // have another operation that can be made compatible with one for free. For
13637 // (dupv64 scalar) --> (extract_high (dup128 scalar))
13639 // This routine does the actual conversion of such DUPs, once outer routines
13640 // have determined that everything else is in order.
13641 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
13643 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
13644 switch (N.getOpcode()) {
13645 case AArch64ISD::DUP:
13646 case AArch64ISD::DUPLANE8:
13647 case AArch64ISD::DUPLANE16:
13648 case AArch64ISD::DUPLANE32:
13649 case AArch64ISD::DUPLANE64:
13650 case AArch64ISD::MOVI:
13651 case AArch64ISD::MOVIshift:
13652 case AArch64ISD::MOVIedit:
13653 case AArch64ISD::MOVImsl:
13654 case AArch64ISD::MVNIshift:
13655 case AArch64ISD::MVNImsl:
13658 // FMOV could be supported, but isn't very useful, as it would only occur
13659 // if you passed a bitcast'ed floating-point immediate to an eligible long
13660 // integer op (addl, smull, ...).
13664 MVT NarrowTy = N.getSimpleValueType();
13665 if (!NarrowTy.is64BitVector())
13668 MVT ElementTy = NarrowTy.getVectorElementType();
13669 unsigned NumElems = NarrowTy.getVectorNumElements();
13670 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
13673 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
13674 DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
13675 DAG.getConstant(NumElems, dl, MVT::i64));
13678 static bool isEssentiallyExtractHighSubvector(SDValue N) {
13679 if (N.getOpcode() == ISD::BITCAST)
13680 N = N.getOperand(0);
13681 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
13683 return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
13684 N.getOperand(0).getValueType().getVectorNumElements() / 2;
13687 /// Helper structure to keep track of ISD::SET_CC operands.
13688 struct GenericSetCCInfo {
13689 const SDValue *Opnd0;
13690 const SDValue *Opnd1;
13694 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
13695 struct AArch64SetCCInfo {
13696 const SDValue *Cmp;
13697 AArch64CC::CondCode CC;
13700 /// Helper structure to keep track of SetCC information.
13702 GenericSetCCInfo Generic;
13703 AArch64SetCCInfo AArch64;
13706 /// Helper structure for reading SetCC information. If the IsAArch64 field is
13707 /// set to true, Info is an AArch64SetCCInfo; otherwise Info is a
13708 /// GenericSetCCInfo.
13709 struct SetCCInfoAndKind {
13714 /// Check whether or not \p Op is a SET_CC operation, either a generic or
13716 /// AArch64 lowered one.
13717 /// \p SetCCInfo is filled accordingly.
13718 /// \post SetCCInfo is meaningful only when this function returns true.
13719 /// \return True when Op is a kind of SET_CC operation.
13720 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
13721 // If this is a setcc, this is straightforward.
13722 if (Op.getOpcode() == ISD::SETCC) {
13723 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
13724 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
13725 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
13726 SetCCInfo.IsAArch64 = false;
13729 // Otherwise, check if this is a matching csel instruction.
13732 // - csel 0, 1, !cc
13733 if (Op.getOpcode() != AArch64ISD::CSEL)
13735 // Set the information about the operands.
13736 // TODO: we want the operands of the Cmp, not the csel.
13737 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
13738 SetCCInfo.IsAArch64 = true;
13739 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
13740 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
13742 // Check that the operands match the constraints:
13743 // (1) Both operands must be constants.
13744 // (2) One must be 1 and the other must be 0.
13745 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
13746 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13749 if (!TValue || !FValue)
13753 if (!TValue->isOne()) {
13754 // Update the comparison when we are interested in !cc.
13755 std::swap(TValue, FValue);
13756 SetCCInfo.Info.AArch64.CC =
13757 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
13759 return TValue->isOne() && FValue->isNullValue();
13762 // Returns true if Op is setcc or zext of setcc.
13763 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
13764 if (isSetCC(Op, Info))
13766 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
13767 isSetCC(Op->getOperand(0), Info));
13770 // The folding we want to perform is:
13771 // (add x, [zext] (setcc cc ...) )
13773 // (csel x, (add x, 1), !cc ...)
13775 // The latter will get matched to a CSINC instruction.
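// For example, for "int f(int x, int a, int b) { return x + (a < b); }" the
// compare sets the flags and the add folds into a single CSINC/CINC.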
13776 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
13777 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
13778 SDValue LHS = Op->getOperand(0);
13779 SDValue RHS = Op->getOperand(1);
13780 SetCCInfoAndKind InfoAndKind;
13782 // If both operands are a SET_CC, then we don't want to perform this
13783 // folding and create another csel as this results in more instructions
13784 // (and higher register usage).
13785 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
13786 isSetCCOrZExtSetCC(RHS, InfoAndKind))
13789 // If neither operand is a SET_CC, give up.
13790 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
13791 std::swap(LHS, RHS);
13792 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
13796 // FIXME: This could be generalized to work for FP comparisons.
13797 EVT CmpVT = InfoAndKind.IsAArch64
13798 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
13799 : InfoAndKind.Info.Generic.Opnd0->getValueType();
13800 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
13806 if (InfoAndKind.IsAArch64) {
13807 CCVal = DAG.getConstant(
13808 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
13810 Cmp = *InfoAndKind.Info.AArch64.Cmp;
13812 Cmp = getAArch64Cmp(
13813 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
13814 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
13817 EVT VT = Op->getValueType(0);
13818 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
13819 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
13822 // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
13823 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
13824 EVT VT = N->getValueType(0);
13825 // Only scalar integer and vector types.
13826 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
13829 SDValue LHS = N->getOperand(0);
13830 SDValue RHS = N->getOperand(1);
13831 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13832 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
13835 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
13836 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
13837 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue())
13840 SDValue Op1 = LHS->getOperand(0);
13841 SDValue Op2 = RHS->getOperand(0);
13842 EVT OpVT1 = Op1.getValueType();
13843 EVT OpVT2 = Op2.getValueType();
13844 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
13845 Op2.getOpcode() != AArch64ISD::UADDV ||
13846 OpVT1.getVectorElementType() != VT)
13849 SDValue Val1 = Op1.getOperand(0);
13850 SDValue Val2 = Op2.getOperand(0);
13851 EVT ValVT = Val1->getValueType(0);
13853 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
13854 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
13855 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
13856 DAG.getConstant(0, DL, MVT::i64));
13859 // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
13860 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
13861 EVT VT = N->getValueType(0);
13862 if (N->getOpcode() != ISD::ADD)
13865 SDValue Dot = N->getOperand(0);
13866 SDValue A = N->getOperand(1);
13867 // Handle commutativity.
13868 auto isZeroDot = [](SDValue Dot) {
13869 return (Dot.getOpcode() == AArch64ISD::UDOT ||
13870 Dot.getOpcode() == AArch64ISD::SDOT) &&
13871 isZerosVector(Dot.getOperand(0).getNode());
13873 if (!isZeroDot(Dot))
13875 if (!isZeroDot(Dot))
13878 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
13879 Dot.getOperand(2));
13882 // The basic add/sub long vector instructions have variants with "2" on the end
13883 // which act on the high-half of their inputs. They are normally matched by
13886 // (add (zeroext (extract_high LHS)),
13887 // (zeroext (extract_high RHS)))
13888 // -> uaddl2 vD, vN, vM
13890 // However, if one of the extracts is something like a duplicate, this
13891 // instruction can still be used profitably. This function puts the DAG into a
13892 // more appropriate form for those patterns to trigger.
13893 static SDValue performAddSubLongCombine(SDNode *N,
13894 TargetLowering::DAGCombinerInfo &DCI,
13895 SelectionDAG &DAG) {
13896 if (DCI.isBeforeLegalizeOps())
13899 MVT VT = N->getSimpleValueType(0);
13900 if (!VT.is128BitVector()) {
13901 if (N->getOpcode() == ISD::ADD)
13902 return performSetccAddFolding(N, DAG);
13906 // Make sure both branches are extended in the same way.
13907 SDValue LHS = N->getOperand(0);
13908 SDValue RHS = N->getOperand(1);
13909 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
13910 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
13911 LHS.getOpcode() != RHS.getOpcode())
13914 unsigned ExtType = LHS.getOpcode();
13916 // It's not worth doing if at least one of the inputs isn't already an
13917 // extract, but we don't know which it'll be so we have to try both.
13918 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
13919 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
13920 if (!RHS.getNode())
13923 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
13924 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
13925 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
13926 if (!LHS.getNode())
13929 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
13932 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
13935 static SDValue performAddSubCombine(SDNode *N,
13936 TargetLowering::DAGCombinerInfo &DCI,
13937 SelectionDAG &DAG) {
13938 // Try to change sum of two reductions.
13939 if (SDValue Val = performUADDVCombine(N, DAG))
13941 if (SDValue Val = performAddDotCombine(N, DAG))
13944 return performAddSubLongCombine(N, DCI, DAG);
13947 // Massage DAGs which we can use the high-half "long" operations on into
13948 // something isel will recognize better. E.g.
13950 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
13951 // (aarch64_neon_umull (extract_high (v2i64 vec)))
13952 // (extract_high (v2i64 (dup128 scalar)))))
13954 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
13955 TargetLowering::DAGCombinerInfo &DCI,
13956 SelectionDAG &DAG) {
13957 if (DCI.isBeforeLegalizeOps())
13960 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
13961 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
13962 assert(LHS.getValueType().is64BitVector() &&
13963 RHS.getValueType().is64BitVector() &&
13964 "unexpected shape for long operation");
13966 // Either node could be a DUP, but it's not worth doing both of them (you'd
13967 // just as well use the non-high version) so look for a corresponding extract
13968 // operation on the other "wing".
13969 if (isEssentiallyExtractHighSubvector(LHS)) {
13970 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
13971 if (!RHS.getNode())
13973 } else if (isEssentiallyExtractHighSubvector(RHS)) {
13974 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
13975 if (!LHS.getNode())
13979 if (IID == Intrinsic::not_intrinsic)
13980 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
13982 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
13983 N->getOperand(0), LHS, RHS);
13986 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
13987 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
13988 unsigned ElemBits = ElemTy.getSizeInBits();
13990 int64_t ShiftAmount;
13991 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
13992 APInt SplatValue, SplatUndef;
13993 unsigned SplatBitSize;
13995 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
13996 HasAnyUndefs, ElemBits) ||
13997 SplatBitSize != ElemBits)
14000 ShiftAmount = SplatValue.getSExtValue();
14001 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
14002 ShiftAmount = CVN->getSExtValue();
14010 llvm_unreachable("Unknown shift intrinsic");
14011 case Intrinsic::aarch64_neon_sqshl:
14012 Opcode = AArch64ISD::SQSHL_I;
14013 IsRightShift = false;
14015 case Intrinsic::aarch64_neon_uqshl:
14016 Opcode = AArch64ISD::UQSHL_I;
14017 IsRightShift = false;
14019 case Intrinsic::aarch64_neon_srshl:
14020 Opcode = AArch64ISD::SRSHR_I;
14021 IsRightShift = true;
14023 case Intrinsic::aarch64_neon_urshl:
14024 Opcode = AArch64ISD::URSHR_I;
14025 IsRightShift = true;
14027 case Intrinsic::aarch64_neon_sqshlu:
14028 Opcode = AArch64ISD::SQSHLU_I;
14029 IsRightShift = false;
14031 case Intrinsic::aarch64_neon_sshl:
14032 case Intrinsic::aarch64_neon_ushl:
14033 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
14034 // left shift for positive shift amounts. Below, we only replace the current
14035 // node with VSHL, if this condition is met.
14036 Opcode = AArch64ISD::VSHL;
14037 IsRightShift = false;
14041 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
14043 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
14044 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
14045 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
14047 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
14048 DAG.getConstant(ShiftAmount, dl, MVT::i32));
14054 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
14055 // the intrinsics must be legal and take an i32, this means there's almost
14056 // certainly going to be a zext in the DAG which we can eliminate.
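// For example, (crc32b acc, (and data, 0xff)) can drop the AND entirely
// because CRC32B only reads the low 8 bits of its data operand.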
14057 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
14058 SDValue AndN = N->getOperand(2);
14059 if (AndN.getOpcode() != ISD::AND)
14062 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
14063 if (!CMask || CMask->getZExtValue() != Mask)
14066 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
14067 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
14070 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
14071 SelectionDAG &DAG) {
14073 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
14074 DAG.getNode(Opc, dl,
14075 N->getOperand(1).getSimpleValueType(),
14077 DAG.getConstant(0, dl, MVT::i64));
14080 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
14082 SDValue Op1 = N->getOperand(1);
14083 SDValue Op2 = N->getOperand(2);
14084 EVT ScalarTy = Op2.getValueType();
14085 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
14086 ScalarTy = MVT::i32;
14088 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
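// For example, index_vector(3, 2) yields <3, 5, 7, ...>, i.e.
// splat(3) + <0, 1, 2, ...> * splat(2).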
14089 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
14090 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
14091 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
14092 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
14093 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
14096 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
14098 SDValue Scalar = N->getOperand(3);
14099 EVT ScalarTy = Scalar.getValueType();
14101 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
14102 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
14104 SDValue Passthru = N->getOperand(1);
14105 SDValue Pred = N->getOperand(2);
14106 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
14107 Pred, Scalar, Passthru);
14110 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
14112 LLVMContext &Ctx = *DAG.getContext();
14113 EVT VT = N->getValueType(0);
14115 assert(VT.isScalableVector() && "Expected a scalable vector.");
14117 // Current lowering only supports the SVE-ACLE types.
14118 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
14121 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
14122 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
14124 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
14126 // Convert everything to the domain of EXT (i.e. bytes).
14127 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
14128 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
14129 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
14130 DAG.getConstant(ElemSize, dl, MVT::i32));
14132 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
14133 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
14136 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
14137 TargetLowering::DAGCombinerInfo &DCI,
14138 SelectionDAG &DAG) {
14139 if (DCI.isBeforeLegalize())
14142 SDValue Comparator = N->getOperand(3);
14143 if (Comparator.getOpcode() == AArch64ISD::DUP ||
14144 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
14145 unsigned IID = getIntrinsicID(N);
14146 EVT VT = N->getValueType(0);
14147 EVT CmpVT = N->getOperand(2).getValueType();
14148 SDValue Pred = N->getOperand(1);
14154 llvm_unreachable("Called with wrong intrinsic!");
14157 // Signed comparisons
14158 case Intrinsic::aarch64_sve_cmpeq_wide:
14159 case Intrinsic::aarch64_sve_cmpne_wide:
14160 case Intrinsic::aarch64_sve_cmpge_wide:
14161 case Intrinsic::aarch64_sve_cmpgt_wide:
14162 case Intrinsic::aarch64_sve_cmplt_wide:
14163 case Intrinsic::aarch64_sve_cmple_wide: {
14164 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
14165 int64_t ImmVal = CN->getSExtValue();
14166 if (ImmVal >= -16 && ImmVal <= 15)
14167 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
14173 // Unsigned comparisons
14174 case Intrinsic::aarch64_sve_cmphs_wide:
14175 case Intrinsic::aarch64_sve_cmphi_wide:
14176 case Intrinsic::aarch64_sve_cmplo_wide:
14177 case Intrinsic::aarch64_sve_cmpls_wide: {
14178 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
14179 uint64_t ImmVal = CN->getZExtValue();
14181 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
14192 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
14193 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
14194 N->getOperand(2), Splat, DAG.getCondCode(CC));
14200 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
14201 AArch64CC::CondCode Cond) {
14202 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14205 assert(Op.getValueType().isScalableVector() &&
14206 TLI.isTypeLegal(Op.getValueType()) &&
14207 "Expected legal scalable vector type!");
14209 // Ensure target-specific opcodes are using a legal type.
14210 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
14211 SDValue TVal = DAG.getConstant(1, DL, OutVT);
14212 SDValue FVal = DAG.getConstant(0, DL, OutVT);
14214 // Set condition code (CC) flags.
14215 SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
14217 // Convert CC to integer based on requested condition.
14218 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
14219 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
14220 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
14221 return DAG.getZExtOrTrunc(Res, DL, VT);
14224 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
14225 SelectionDAG &DAG) {
14228 SDValue Pred = N->getOperand(1);
14229 SDValue VecToReduce = N->getOperand(2);
14231 // NOTE: The integer reduction's result type is not always linked to the
14232 // operand's element type so we construct it from the intrinsic's result type.
14233 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
14234 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
14236 // SVE reductions set the whole vector register with the first element
14237 // containing the reduction result, which we'll now extract.
14238 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14239 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14243 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
14244 SelectionDAG &DAG) {
14247 SDValue Pred = N->getOperand(1);
14248 SDValue VecToReduce = N->getOperand(2);
14250 EVT ReduceVT = VecToReduce.getValueType();
14251 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
14253 // SVE reductions set the whole vector register with the first element
14254 // containing the reduction result, which we'll now extract.
14255 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14256 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14260 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
14261 SelectionDAG &DAG) {
14264 SDValue Pred = N->getOperand(1);
14265 SDValue InitVal = N->getOperand(2);
14266 SDValue VecToReduce = N->getOperand(3);
14267 EVT ReduceVT = VecToReduce.getValueType();
14269 // Ordered reductions use the first lane of the result vector as the
14270 // reduction's initial value.
14271 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14272 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
14273 DAG.getUNDEF(ReduceVT), InitVal, Zero);
14275 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
14277 // SVE reductions set the whole vector register with the first element
14278 // containing the reduction result, which we'll now extract.
14279 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14283 static bool isAllActivePredicate(SDValue N) {
14284 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14286 // Look through cast.
14287 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14288 N = N.getOperand(0);
14289 // When reinterpreting from a type with fewer elements the "new" elements
14290 // are not active, so bail if they're likely to be used.
14291 if (N.getValueType().getVectorMinNumElements() < NumElts)
14295 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14296 // or smaller than the implicit element type represented by N.
14297 // NOTE: A larger element count implies a smaller element type.
14298 if (N.getOpcode() == AArch64ISD::PTRUE &&
14299 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14300 return N.getValueType().getVectorMinNumElements() >= NumElts;
14305 // If a merged operation has no inactive lanes we can relax it to a predicated
14306 // or unpredicated operation, which potentially allows better isel (perhaps
14307 // using immediate forms) or relaxing register reuse requirements.
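// For example, (aarch64_sve_add (ptrue all), x, y) can be emitted as a plain
// ISD::ADD, and (aarch64_sve_mul (ptrue all), x, y) as MUL_PRED without the
// merging constraint on the first data operand.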
14308 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
14310 bool UnpredOp = false) {
14311 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
14312 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
14313 SDValue Pg = N->getOperand(1);
14315 // ISD way to specify an all active predicate.
14316 if (isAllActivePredicate(Pg)) {
14318 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(2),
14321 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg,
14322 N->getOperand(2), N->getOperand(3));
14325 // FUTURE: SplatVector(true)
14329 static SDValue performIntrinsicCombine(SDNode *N,
14330 TargetLowering::DAGCombinerInfo &DCI,
14331 const AArch64Subtarget *Subtarget) {
14332 SelectionDAG &DAG = DCI.DAG;
14333 unsigned IID = getIntrinsicID(N);
14337 case Intrinsic::aarch64_neon_vcvtfxs2fp:
14338 case Intrinsic::aarch64_neon_vcvtfxu2fp:
14339 return tryCombineFixedPointConvert(N, DCI, DAG);
14340 case Intrinsic::aarch64_neon_saddv:
14341 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
14342 case Intrinsic::aarch64_neon_uaddv:
14343 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
14344 case Intrinsic::aarch64_neon_sminv:
14345 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
14346 case Intrinsic::aarch64_neon_uminv:
14347 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
14348 case Intrinsic::aarch64_neon_smaxv:
14349 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
14350 case Intrinsic::aarch64_neon_umaxv:
14351 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
14352 case Intrinsic::aarch64_neon_fmax:
14353 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
14354 N->getOperand(1), N->getOperand(2));
14355 case Intrinsic::aarch64_neon_fmin:
14356 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
14357 N->getOperand(1), N->getOperand(2));
14358 case Intrinsic::aarch64_neon_fmaxnm:
14359 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
14360 N->getOperand(1), N->getOperand(2));
14361 case Intrinsic::aarch64_neon_fminnm:
14362 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
14363 N->getOperand(1), N->getOperand(2));
14364 case Intrinsic::aarch64_neon_smull:
14365 case Intrinsic::aarch64_neon_umull:
14366 case Intrinsic::aarch64_neon_pmull:
14367 case Intrinsic::aarch64_neon_sqdmull:
14368 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
14369 case Intrinsic::aarch64_neon_sqshl:
14370 case Intrinsic::aarch64_neon_uqshl:
14371 case Intrinsic::aarch64_neon_sqshlu:
14372 case Intrinsic::aarch64_neon_srshl:
14373 case Intrinsic::aarch64_neon_urshl:
14374 case Intrinsic::aarch64_neon_sshl:
14375 case Intrinsic::aarch64_neon_ushl:
14376 return tryCombineShiftImm(IID, N, DAG);
14377 case Intrinsic::aarch64_crc32b:
14378 case Intrinsic::aarch64_crc32cb:
14379 return tryCombineCRC32(0xff, N, DAG);
14380 case Intrinsic::aarch64_crc32h:
14381 case Intrinsic::aarch64_crc32ch:
14382 return tryCombineCRC32(0xffff, N, DAG);
14383 case Intrinsic::aarch64_sve_saddv:
14384 // There is no i64 version of SADDV because the sign is irrelevant.
14385 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
14386 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
14388 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
14389 case Intrinsic::aarch64_sve_uaddv:
14390 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
14391 case Intrinsic::aarch64_sve_smaxv:
14392 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
14393 case Intrinsic::aarch64_sve_umaxv:
14394 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
14395 case Intrinsic::aarch64_sve_sminv:
14396 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
14397 case Intrinsic::aarch64_sve_uminv:
14398 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
14399 case Intrinsic::aarch64_sve_orv:
14400 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
14401 case Intrinsic::aarch64_sve_eorv:
14402 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
14403 case Intrinsic::aarch64_sve_andv:
14404 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
14405 case Intrinsic::aarch64_sve_index:
14406 return LowerSVEIntrinsicIndex(N, DAG);
14407 case Intrinsic::aarch64_sve_dup:
14408 return LowerSVEIntrinsicDUP(N, DAG);
14409 case Intrinsic::aarch64_sve_dup_x:
14410 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
14412 case Intrinsic::aarch64_sve_ext:
14413 return LowerSVEIntrinsicEXT(N, DAG);
14414 case Intrinsic::aarch64_sve_mul:
14415 return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
14416 case Intrinsic::aarch64_sve_smulh:
14417 return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
14418 case Intrinsic::aarch64_sve_umulh:
14419 return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
14420 case Intrinsic::aarch64_sve_smin:
14421 return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
14422 case Intrinsic::aarch64_sve_umin:
14423 return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
14424 case Intrinsic::aarch64_sve_smax:
14425 return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
14426 case Intrinsic::aarch64_sve_umax:
14427 return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
14428 case Intrinsic::aarch64_sve_lsl:
14429 return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
14430 case Intrinsic::aarch64_sve_lsr:
14431 return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
14432 case Intrinsic::aarch64_sve_asr:
14433 return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
14434 case Intrinsic::aarch64_sve_fadd:
14435 return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
14436 case Intrinsic::aarch64_sve_fsub:
14437 return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
14438 case Intrinsic::aarch64_sve_fmul:
14439 return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
14440 case Intrinsic::aarch64_sve_add:
14441 return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
14442 case Intrinsic::aarch64_sve_sub:
14443 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
14444 case Intrinsic::aarch64_sve_and:
14445 return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
14446 case Intrinsic::aarch64_sve_bic:
14447 return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
14448 case Intrinsic::aarch64_sve_eor:
14449 return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
14450 case Intrinsic::aarch64_sve_orr:
14451 return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
14452 case Intrinsic::aarch64_sve_sqadd:
14453 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
14454 case Intrinsic::aarch64_sve_sqsub:
14455 return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
14456 case Intrinsic::aarch64_sve_uqadd:
14457 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
14458 case Intrinsic::aarch64_sve_uqsub:
14459 return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
14460 case Intrinsic::aarch64_sve_sqadd_x:
14461 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
14462 N->getOperand(1), N->getOperand(2));
14463 case Intrinsic::aarch64_sve_sqsub_x:
14464 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
14465 N->getOperand(1), N->getOperand(2));
14466 case Intrinsic::aarch64_sve_uqadd_x:
14467 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
14468 N->getOperand(1), N->getOperand(2));
14469 case Intrinsic::aarch64_sve_uqsub_x:
14470 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
14471 N->getOperand(1), N->getOperand(2));
14472 case Intrinsic::aarch64_sve_cmphs:
14473 if (!N->getOperand(2).getValueType().isFloatingPoint())
14474 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14475 N->getValueType(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
    break;
14478 case Intrinsic::aarch64_sve_cmphi:
14479 if (!N->getOperand(2).getValueType().isFloatingPoint())
14480 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14481 N->getValueType(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
    break;
14484 case Intrinsic::aarch64_sve_fcmpge:
14485 case Intrinsic::aarch64_sve_cmpge:
14486 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14487 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14488 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
14490 case Intrinsic::aarch64_sve_fcmpgt:
14491 case Intrinsic::aarch64_sve_cmpgt:
14492 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14493 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14494 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
14496 case Intrinsic::aarch64_sve_fcmpeq:
14497 case Intrinsic::aarch64_sve_cmpeq:
14498 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14499 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14500 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
14502 case Intrinsic::aarch64_sve_fcmpne:
14503 case Intrinsic::aarch64_sve_cmpne:
14504 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14505 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14506 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
14508 case Intrinsic::aarch64_sve_fcmpuo:
14509 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14510 N->getValueType(0), N->getOperand(1), N->getOperand(2),
14511 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
14513 case Intrinsic::aarch64_sve_fadda:
14514 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
14515 case Intrinsic::aarch64_sve_faddv:
14516 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
14517 case Intrinsic::aarch64_sve_fmaxnmv:
14518 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
14519 case Intrinsic::aarch64_sve_fmaxv:
14520 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
14521 case Intrinsic::aarch64_sve_fminnmv:
14522 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
14523 case Intrinsic::aarch64_sve_fminv:
14524 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
14525 case Intrinsic::aarch64_sve_sel:
14526 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
14527 N->getOperand(1), N->getOperand(2), N->getOperand(3));
14528 case Intrinsic::aarch64_sve_cmpeq_wide:
14529 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
14530 case Intrinsic::aarch64_sve_cmpne_wide:
14531 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
14532 case Intrinsic::aarch64_sve_cmpge_wide:
14533 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
14534 case Intrinsic::aarch64_sve_cmpgt_wide:
14535 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
14536 case Intrinsic::aarch64_sve_cmplt_wide:
14537 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
14538 case Intrinsic::aarch64_sve_cmple_wide:
14539 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
14540 case Intrinsic::aarch64_sve_cmphs_wide:
14541 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
14542 case Intrinsic::aarch64_sve_cmphi_wide:
14543 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
14544 case Intrinsic::aarch64_sve_cmplo_wide:
14545 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
14546 case Intrinsic::aarch64_sve_cmpls_wide:
14547 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
14548 case Intrinsic::aarch64_sve_ptest_any:
14549 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14550 AArch64CC::ANY_ACTIVE);
14551 case Intrinsic::aarch64_sve_ptest_first:
14552 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14553 AArch64CC::FIRST_ACTIVE);
14554 case Intrinsic::aarch64_sve_ptest_last:
14555 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14556 AArch64CC::LAST_ACTIVE);
14561 static SDValue performExtendCombine(SDNode *N,
14562 TargetLowering::DAGCombinerInfo &DCI,
14563 SelectionDAG &DAG) {
14564 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
14565 // we can convert that DUP into another extract_high (of a bigger DUP), which
14566 // helps the backend to decide that an sabdl2 would be useful, saving a real
14567 // extract_high operation.
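  // For example (illustrative):
  //   (zext (sabd (extract_high v), (dup w)))
  // can be rewritten so that the DUP is itself the extract_high of a wider
  // DUP, which lets instruction selection form sabdl2 directly.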
14568 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
14569 (N->getOperand(0).getOpcode() == ISD::ABDU ||
14570 N->getOperand(0).getOpcode() == ISD::ABDS)) {
    SDNode *ABDNode = N->getOperand(0).getNode();
    SDValue NewABD =
        tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
    if (!NewABD.getNode())
      return SDValue();
    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
  }
  return SDValue();
}
14582 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
14583 SDValue SplatVal, unsigned NumVecElts) {
14584 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
14585 unsigned OrigAlignment = St.getAlignment();
14586 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
14588 // Create scalar stores. This is at least as good as the code sequence for a
14589 // split unaligned store which is a dup.s, ext.b, and two stores.
14590 // Most of the time the three stores should be replaced by store pair
14591 // instructions (stp).
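  // For instance (illustrative), a v2i64 splat becomes two i64 scalar stores
  // here, which the load/store optimizer can later fuse into a single stp.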
  SDLoc DL(&St);
  SDValue BasePtr = St.getBasePtr();
  uint64_t BaseOffset = 0;

  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
  SDValue NewST1 =
      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
                   OrigAlignment, St.getMemOperand()->getFlags());
  // As this is in ISel, we will not merge this add, which may degrade results.
14602 if (BasePtr->getOpcode() == ISD::ADD &&
14603 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
14604 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
14605 BasePtr = BasePtr->getOperand(0);
14608 unsigned Offset = EltOffset;
14609 while (--NumVecElts) {
14610 unsigned Alignment = MinAlign(OrigAlignment, Offset);
14611 SDValue OffsetPtr =
14612 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
14613 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
14614 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
14615 PtrInfo.getWithOffset(Offset), Alignment,
14616 St.getMemOperand()->getFlags());
    Offset += EltOffset;
  }
  return NewST1;
}
// Returns an SVE type that ContentTy can be trivially sign or zero extended
// into.
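// For example, an nxv4i16 value is held in an nxv4i32 container and an
// nxv2f32 value in an nxv2i64 container (see the switch below).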
14624 static MVT getSVEContainerType(EVT ContentTy) {
14625 assert(ContentTy.isSimple() && "No SVE containers for extended types");
  switch (ContentTy.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("No known SVE container for this MVT type");
  case MVT::nxv2i8: case MVT::nxv2i16: case MVT::nxv2i32:
  case MVT::nxv2i64: case MVT::nxv2f32: case MVT::nxv2f64:
    return MVT::nxv2i64;
  case MVT::nxv4i8: case MVT::nxv4i16: case MVT::nxv4i32: case MVT::nxv4f32:
    return MVT::nxv4i32;
  case MVT::nxv8i8: case MVT::nxv8i16: case MVT::nxv8f16: case MVT::nxv8bf16:
    return MVT::nxv8i16;
  case MVT::nxv16i8:
    return MVT::nxv16i8;
  }
}
14652 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();
14659 EVT ContainerVT = VT;
14660 if (ContainerVT.isInteger())
14661 ContainerVT = getSVEContainerType(ContainerVT);
14663 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
14664 SDValue Ops[] = { N->getOperand(0), // Chain
14665 N->getOperand(2), // Pg
14666 N->getOperand(3), // Base
14667 DAG.getValueType(VT) };
14669 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
14670 SDValue LoadChain = SDValue(Load.getNode(), 1);
14672 if (ContainerVT.isInteger() && (VT != ContainerVT))
14673 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
14675 return DAG.getMergeValues({ Load, LoadChain }, DL);
14678 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  EVT PtrTy = N->getOperand(3).getValueType();

  if (VT == MVT::nxv8bf16 &&
      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
    return SDValue();

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();
14691 auto *MINode = cast<MemIntrinsicSDNode>(N);
14692 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
14693 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
14694 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
14695 MINode->getOperand(2), PassThru,
14696 MINode->getMemoryVT(), MINode->getMemOperand(),
14697 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
14699 if (VT.isFloatingPoint()) {
14700 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
    return DAG.getMergeValues(Ops, DL);
  }

  return L;
}
14707 template <unsigned Opcode>
14708 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
14709 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
14710 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
14711 "Unsupported opcode.");
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  if (VT == MVT::nxv8bf16 &&
      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
    return SDValue();

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();
14722 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
14723 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
14724 SDValue LoadChain = SDValue(Load.getNode(), 1);
14726 if (VT.isFloatingPoint())
14727 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
14729 return DAG.getMergeValues({Load, LoadChain}, DL);
14732 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Data = N->getOperand(2);
14735 EVT DataVT = Data.getValueType();
14736 EVT HwSrcVt = getSVEContainerType(DataVT);
14737 SDValue InputVT = DAG.getValueType(DataVT);
  if (DataVT == MVT::nxv8bf16 &&
      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
    return SDValue();

  if (DataVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

  SDValue SrcNew;
  if (Data.getValueType().isFloatingPoint())
    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
  else
    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
  SDValue Ops[] = { N->getOperand(0), // Chain
                    SrcNew,
                    N->getOperand(4), // Base
                    N->getOperand(3), // Pg
                    InputVT };

  return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
}
14762 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Data = N->getOperand(2);
14766 EVT DataVT = Data.getValueType();
14767 EVT PtrTy = N->getOperand(4).getValueType();
14769 if (DataVT == MVT::nxv8bf16 &&
      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
    return SDValue();
14773 if (DataVT.isFloatingPoint())
14774 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
14776 auto *MINode = cast<MemIntrinsicSDNode>(N);
14777 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
14778 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
14779 MINode->getMemoryVT(), MINode->getMemOperand(),
14780 ISD::UNINDEXED, false, false);
14783 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
14784 /// load store optimizer pass will merge them to store pair stores. This should
14785 /// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instruction and one register
14787 /// live range will be removed.
/// For example, the final generated code should be:
///
///   stp xzr, xzr, [x0]
///
/// instead of:
///
///   movi v0.2d, #0
///   str q0, [x0]
14798 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
14799 SDValue StVal = St.getValue();
14800 EVT VT = StVal.getValueType();
14802 // Avoid scalarizing zero splat stores for scalable vectors.
  if (VT.isScalableVector())
    return SDValue();
14806 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
14807 // 2, 3 or 4 i32 elements.
14808 int NumVecElts = VT.getVectorNumElements();
14809 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
14810 VT.getVectorElementType().getSizeInBits() == 64) ||
14811 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
        VT.getVectorElementType().getSizeInBits() == 32)))
    return SDValue();

  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }

  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
      return SDValue();
  }
  // Use a CopyFromReg WZR/XZR here to prevent
  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
  SDLoc DL(&St);
  unsigned ZeroReg;
  EVT ZeroVT;
  if (VT.getVectorElementType().getSizeInBits() == 32) {
    ZeroReg = AArch64::WZR;
    ZeroVT = MVT::i32;
  } else {
    ZeroReg = AArch64::XZR;
    ZeroVT = MVT::i64;
  }
  SDValue SplatVal = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
14860 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
14861 /// value. The load store optimizer pass will merge them to store pair stores.
14862 /// This has better performance than a splat of the scalar followed by a split
14863 /// vector store. Even if the stores are not merged it is four stores vs a dup,
14864 /// followed by an ext.b and two stores.
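/// For example (illustrative), splatting a GPR value into a v4i32 and storing
/// it can instead be emitted as four scalar str instructions, which the
/// load/store optimizer then merges into two stp instructions.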
14865 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
14866 SDValue StVal = St.getValue();
14867 EVT VT = StVal.getValueType();
14869 // Don't replace floating point stores, they possibly won't be transformed to
14870 // stp because of the store pair suppress pass.
14871 if (VT.isFloatingPoint())
14874 // We can express a splat as store pair(s) for 2 or 4 elements.
14875 unsigned NumVecElts = VT.getVectorNumElements();
14876 if (NumVecElts != 4 && NumVecElts != 2)
14879 // If the store is truncating then it's going down to i16 or smaller, which
14880 // means it can be implemented in a single store anyway.
14881 if (St.isTruncatingStore())
14884 // Check that this is a splat.
14885 // Make sure that each of the relevant vector element locations are inserted
14886 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
  SDValue SplatVal;
  for (unsigned I = 0; I < NumVecElts; ++I) {
    // Check for insert vector elements.
    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
      return SDValue();

    // Check that same value is inserted at each vector element.
    if (I == 0)
      SplatVal = StVal.getOperand(1);
    else if (StVal.getOperand(1) != SplatVal)
      return SDValue();

    // Check insert element index.
    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
    if (!CIndex)
      return SDValue();
    uint64_t IndexVal = CIndex->getZExtValue();
    if (IndexVal >= NumVecElts)
      return SDValue();
    IndexNotInserted.reset(IndexVal);

    StVal = StVal.getOperand(0);
  }
  // Check that all vector element locations were inserted to.
  if (IndexNotInserted.any())
    return SDValue();

  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
14918 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                            SelectionDAG &DAG,
                            const AArch64Subtarget *Subtarget) {
  StoreSDNode *S = cast<StoreSDNode>(N);
  if (S->isVolatile() || S->isIndexed())
    return SDValue();

  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();

  if (!VT.isFixedLengthVector())
    return SDValue();
14932 // If we get a splat of zeros, convert this vector store to a store of
14933 // scalars. They will be merged into store pairs of xzr thereby removing one
14934 // instruction and one register.
14935 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
14936 return ReplacedZeroSplat;
14938 // FIXME: The logic for deciding if an unaligned store should be split should
14939 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
14940 // a call to that function here.
  if (!Subtarget->isMisaligned128StoreSlow())
    return SDValue();

  // Don't split at -Oz.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();
14954 // Split unaligned 16B stores. They are terrible for performance.
14955 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
14956 // extensions can use this to mark that it does not want splitting to happen
14957 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
14958 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
14959 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
      S->getAlignment() <= 2)
    return SDValue();
14963 // If we get a splat of a scalar convert this vector store to a store of
  // scalars. They will be merged into store pairs thereby removing two
  // instructions.
  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
    return ReplacedSplat;

  SDLoc DL(S);
14971 // Split VT into two.
14972 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14973 unsigned NumElts = HalfVT.getVectorNumElements();
14974 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
14975 DAG.getConstant(0, DL, MVT::i64));
14976 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
14977 DAG.getConstant(NumElts, DL, MVT::i64));
14978 SDValue BasePtr = S->getBasePtr();
  SDValue NewST1 =
      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
14981 S->getAlignment(), S->getMemOperand()->getFlags());
14982 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
14983 DAG.getConstant(8, DL, MVT::i64));
14984 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
14985 S->getPointerInfo(), S->getAlignment(),
14986 S->getMemOperand()->getFlags());
14989 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
14992 // splice(pg, op1, undef) -> op1
14993 if (N->getOperand(2).isUndef())
14994 return N->getOperand(1);
14999 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op0 = N->getOperand(0);
15002 SDValue Op1 = N->getOperand(1);
15003 EVT ResVT = N->getValueType(0);
15005 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
15006 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
15007 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
15008 SDValue X = Op0.getOperand(0).getOperand(0);
15009 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
15013 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
15014 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
15015 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
15016 SDValue Z = Op1.getOperand(0).getOperand(1);
15017 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
15024 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
15025 unsigned Opc = N->getOpcode();
15027 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
15028 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
15029 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
15030 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
15031 "Invalid opcode.");
15033 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
15034 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
15035 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
15036 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
15037 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
15038 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
15039 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
15040 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
15043 SDValue Chain = N->getOperand(0);
15044 SDValue Pg = N->getOperand(1);
15045 SDValue Base = N->getOperand(2);
15046 SDValue Offset = N->getOperand(3);
15047 SDValue Ty = N->getOperand(4);
15049 EVT ResVT = N->getValueType(0);
15051 const auto OffsetOpc = Offset.getOpcode();
15052 const bool OffsetIsZExt =
15053 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
15054 const bool OffsetIsSExt =
15055 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
15057 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
15058 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
15059 SDValue ExtPg = Offset.getOperand(0);
15060 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
15061 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
15063 // If the predicate for the sign- or zero-extended offset is the
15064 // same as the predicate used for this load and the sign-/zero-extension
    // was from a 32-bit value...
15066 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
15067 SDValue UnextendedOffset = Offset.getOperand(1);
15069 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
      if (Signed)
        NewOpc = getSignExtendedGatherOpcode(NewOpc);
15073 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
15074 {Chain, Pg, Base, UnextendedOffset, Ty});
15081 /// Optimize a vector shift instruction and its operand if shifted out
15082 /// bits are not used.
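/// For a right shift by N bits, only the low N bits of the operand are shifted
/// out, so the combine only demands the remaining high bits from the operand.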
15083 static SDValue performVectorShiftCombine(SDNode *N,
15084 const AArch64TargetLowering &TLI,
15085 TargetLowering::DAGCombinerInfo &DCI) {
15086 assert(N->getOpcode() == AArch64ISD::VASHR ||
15087 N->getOpcode() == AArch64ISD::VLSHR);
15089 SDValue Op = N->getOperand(0);
15090 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
15092 unsigned ShiftImm = N->getConstantOperandVal(1);
15093 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
15095 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
15096 APInt DemandedMask = ~ShiftedOutBits;
15098 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15099 return SDValue(N, 0);
15104 /// Target-specific DAG combine function for post-increment LD1 (lane) and
15105 /// post-increment LD1R.
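/// Roughly, when a scalar load feeds an insert into a vector lane (or a dup to
/// all lanes) and the same address is separately incremented by the element
/// size, the load, insert/dup, and increment can be folded into a single
/// post-incremented LD1LANEpost/LD1DUPpost node.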
15106 static SDValue performPostLD1Combine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     bool IsLaneOp) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
15112 SelectionDAG &DAG = DCI.DAG;
15113 EVT VT = N->getValueType(0);
  if (VT.isScalableVector())
    return SDValue();
15118 unsigned LoadIdx = IsLaneOp ? 1 : 0;
15119 SDNode *LD = N->getOperand(LoadIdx).getNode();
  // If it is not a LOAD, we cannot do such a combine.
  if (LD->getOpcode() != ISD::LOAD)
    return SDValue();
15124 // The vector lane must be a constant in the LD1LANE opcode.
  SDValue Lane;
  if (IsLaneOp) {
    Lane = N->getOperand(2);
    auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
    if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
      return SDValue();
  }
15133 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
15134 EVT MemVT = LoadSDN->getMemoryVT();
15135 // Check if memory operand is the same type as the vector element.
  if (MemVT != VT.getVectorElementType())
    return SDValue();

  // Check if there are other uses. If so, do not combine as it will introduce
  // an extra load.
  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
       ++UI) {
    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
      continue;
    if (*UI != N)
      return SDValue();
  }
15149 SDValue Addr = LD->getOperand(1);
15150 SDValue Vector = N->getOperand(0);
15151 // Search for a use of the address operand that is an increment.
15152 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
15153 Addr.getNode()->use_end(); UI != UE; ++UI) {
15154 SDNode *User = *UI;
15155 if (User->getOpcode() != ISD::ADD
        || UI.getUse().getResNo() != Addr.getResNo())
      continue;
15159 // If the increment is a constant, it must match the memory ref size.
15160 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15161 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
15162 uint32_t IncVal = CInc->getZExtValue();
15163 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
      if (IncVal != NumBytes)
        continue;
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }
15169 // To avoid cycle construction make sure that neither the load nor the add
15170 // are predecessors to each other or the Vector.
15171 SmallPtrSet<const SDNode *, 32> Visited;
15172 SmallVector<const SDNode *, 16> Worklist;
15173 Visited.insert(Addr.getNode());
15174 Worklist.push_back(User);
15175 Worklist.push_back(LD);
15176 Worklist.push_back(Vector.getNode());
15177 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;
15181 SmallVector<SDValue, 8> Ops;
15182 Ops.push_back(LD->getOperand(0)); // Chain
    if (IsLaneOp) {
      Ops.push_back(Vector); // The vector to be inserted
      Ops.push_back(Lane);   // The lane to be inserted in the vector
    }
    Ops.push_back(Addr);
15188 Ops.push_back(Inc);
15190 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
15191 SDVTList SDTys = DAG.getVTList(Tys);
15192 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
15193 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                           MemVT,
                                           LoadSDN->getMemOperand());
15197 // Update the uses.
15198 SDValue NewResults[] = {
15199 SDValue(LD, 0), // The result of load
15200 SDValue(UpdN.getNode(), 2) // Chain
15202 DCI.CombineTo(LD, NewResults);
15203 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
15204 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
15211 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
15212 /// address translation.
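/// With top-byte ignore (TBI), bits [63:56] of a pointer are not used for
/// translation, so only the low 56 bits of Addr are demanded below.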
15213 static bool performTBISimplification(SDValue Addr,
15214 TargetLowering::DAGCombinerInfo &DCI,
15215 SelectionDAG &DAG) {
  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
15219 !DCI.isBeforeLegalizeOps());
15220 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15221 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
    return true;
  }

  return false;
}
15228 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
15229 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
15230 "Expected STORE dag node in input!");
15232 if (auto Store = dyn_cast<StoreSDNode>(N)) {
    if (!Store->isTruncatingStore() || Store->isIndexed())
      return SDValue();
15235 SDValue Ext = Store->getValue();
15236 auto ExtOpCode = Ext.getOpcode();
15237 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
        ExtOpCode != ISD::ANY_EXTEND)
      return SDValue();
15240 SDValue Orig = Ext->getOperand(0);
    if (Store->getMemoryVT() != Orig->getValueType(0))
      return SDValue();
15243 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
15244 Store->getBasePtr(), Store->getPointerInfo(),
                         Store->getAlign());
  }

  return SDValue();
}
15251 static SDValue performSTORECombine(SDNode *N,
15252 TargetLowering::DAGCombinerInfo &DCI,
                                   SelectionDAG &DAG,
                                   const AArch64Subtarget *Subtarget) {
  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
    return Split;
15258 if (Subtarget->supportsAddressTopByteIgnored() &&
15259 performTBISimplification(N->getOperand(2), DCI, DAG))
15260 return SDValue(N, 0);
  if (SDValue Store = foldTruncStoreOfExt(DAG, N))
    return Store;

  return SDValue();
}
15268 /// Target-specific DAG combine function for NEON load/store intrinsics
15269 /// to merge base address updates.
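/// For example (illustrative), an aarch64_neon_ld2 whose base pointer is also
/// incremented by the size of the loaded data can be replaced with the
/// post-incremented LD2post node, folding the address update into the load.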
15270 static SDValue performNEONPostLDSTCombine(SDNode *N,
15271 TargetLowering::DAGCombinerInfo &DCI,
15272 SelectionDAG &DAG) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();
15276 unsigned AddrOpIdx = N->getNumOperands() - 1;
15277 SDValue Addr = N->getOperand(AddrOpIdx);
15279 // Search for a use of the address operand that is an increment.
15280 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
15281 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
15282 SDNode *User = *UI;
15283 if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;
15287 // Check that the add is independent of the load/store. Otherwise, folding
15288 // it would create a cycle.
15289 SmallPtrSet<const SDNode *, 32> Visited;
15290 SmallVector<const SDNode *, 16> Worklist;
15291 Visited.insert(Addr.getNode());
15292 Worklist.push_back(N);
15293 Worklist.push_back(User);
15294 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;
15298 // Find the new opcode for the updating load/store.
15299 bool IsStore = false;
15300 bool IsLaneOp = false;
15301 bool IsDupOp = false;
15302 unsigned NewOpc = 0;
15303 unsigned NumVecs = 0;
15304 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default: llvm_unreachable("unexpected intrinsic for Neon base update");
15307 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
15308 NumVecs = 2; break;
15309 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
15310 NumVecs = 3; break;
15311 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
15312 NumVecs = 4; break;
15313 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
15314 NumVecs = 2; IsStore = true; break;
15315 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
15316 NumVecs = 3; IsStore = true; break;
15317 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
15318 NumVecs = 4; IsStore = true; break;
15319 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
15320 NumVecs = 2; break;
15321 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
15322 NumVecs = 3; break;
15323 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
15324 NumVecs = 4; break;
15325 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
15326 NumVecs = 2; IsStore = true; break;
15327 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
15328 NumVecs = 3; IsStore = true; break;
15329 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
15330 NumVecs = 4; IsStore = true; break;
15331 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
15332 NumVecs = 2; IsDupOp = true; break;
15333 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
15334 NumVecs = 3; IsDupOp = true; break;
15335 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
15336 NumVecs = 4; IsDupOp = true; break;
15337 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
15338 NumVecs = 2; IsLaneOp = true; break;
15339 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
15340 NumVecs = 3; IsLaneOp = true; break;
15341 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
15342 NumVecs = 4; IsLaneOp = true; break;
15343 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
15344 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
15345 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
15346 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
15347 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
15348 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
    }

    EVT VecTy;
    if (IsStore)
      VecTy = N->getOperand(2).getValueType();
    else
      VecTy = N->getValueType(0);
15357 // If the increment is a constant, it must match the memory ref size.
15358 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15359 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
15360 uint32_t IncVal = CInc->getZExtValue();
15361 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15362 if (IsLaneOp || IsDupOp)
15363 NumBytes /= VecTy.getVectorNumElements();
      if (IncVal != NumBytes)
        continue;
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }
15368 SmallVector<SDValue, 8> Ops;
15369 Ops.push_back(N->getOperand(0)); // Incoming chain
15370 // Load lane and store have vector list as input.
15371 if (IsLaneOp || IsStore)
15372 for (unsigned i = 2; i < AddrOpIdx; ++i)
15373 Ops.push_back(N->getOperand(i));
15374 Ops.push_back(Addr); // Base register
15375 Ops.push_back(Inc);
    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
    EVT Tys[6];
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i64; // Type of write back register
    Tys[n] = MVT::Other; // Type of the chain
15385 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
15387 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
15388 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
15389 MemInt->getMemoryVT(),
15390 MemInt->getMemOperand());
15392 // Update the uses.
15393 std::vector<SDValue> NewResults;
15394 for (unsigned i = 0; i < NumResultVecs; ++i) {
15395 NewResults.push_back(SDValue(UpdN.getNode(), i));
15397 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
15398 DCI.CombineTo(N, NewResults);
15399 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
15406 // Checks to see if the value is the prescribed width and returns information
15407 // about its extension mode.
15409 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
15410 ExtType = ISD::NON_EXTLOAD;
15411 switch(V.getNode()->getOpcode()) {
15415 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
15416 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
15417 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
15418 ExtType = LoadNode->getExtensionType();
15423 case ISD::AssertSext: {
15424 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
15425 if ((TypeNode->getVT() == MVT::i8 && width == 8)
15426 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
15427 ExtType = ISD::SEXTLOAD;
15432 case ISD::AssertZext: {
15433 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
15434 if ((TypeNode->getVT() == MVT::i8 && width == 8)
15435 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
15436 ExtType = ISD::ZEXTLOAD;
15441 case ISD::Constant:
15442 case ISD::TargetConstant: {
15443 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
15444 1LL << (width - 1);
15451 // This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
//
//   (CC (SUBS (AND (ADD Input, AddConstant), 0xff), CompConstant))
//
// i.e. Input and AddConstant feed an ADD, the result is masked by an AND
// (0xff in the 8-bit case), and the masked value is then compared (SUBS)
// against CompConstant, with the flags consumed according to CC.
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width of input; the pattern above
// is specific to 8 bits).
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of an AddConstant (w1) and a CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct pair of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns for the current extension type of the Input (w0).
//
//   and      w10, w8, #0x0f
//   cmp      w8, w2
//   cset     w9, AArch64CC
//   cmp      w10, w2
//   cset     w11, AArch64CC
//
// i.e. the condition is evaluated once on the unmasked value (w8) and once on
// the masked value (w10), and the two results (w9 and w11) are then compared.
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave outputs equivalent to the above
// function for all inputs, so they can be used to determine if the removal is
// legal.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed,
// factored out of the DAG recognition as the DAG can take several forms.
15512 static bool isEquivalentMaskless(unsigned CC, unsigned width,
15513 ISD::LoadExtType ExtType, int AddConstant,
15514 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
15518 int MaxUInt = (1 << width);
15520 // For the purposes of these comparisons sign extending the type is
15521 // equivalent to zero extending the add and displacing it by half the integer
15522 // width. Provided we are careful and make sure our equations are valid over
15523 // the whole range we can just adjust the input and avoid writing equations
15524 // for sign extended inputs.
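  // For example, with width == 8 a sign-extended input in [-128, 127] behaves
  // like a zero-extended input in [0, 255] once AddConstant is reduced by 128,
  // which is exactly the adjustment made below.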
15525 if (ExtType == ISD::SEXTLOAD)
15526 AddConstant -= (1 << (width-1));
15529 case AArch64CC::LE:
15530 case AArch64CC::GT:
15531 if ((AddConstant == 0) ||
15532 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
15533 (AddConstant >= 0 && CompConstant < 0) ||
15534 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
15537 case AArch64CC::LT:
15538 case AArch64CC::GE:
15539 if ((AddConstant == 0) ||
15540 (AddConstant >= 0 && CompConstant <= 0) ||
15541 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
15544 case AArch64CC::HI:
15545 case AArch64CC::LS:
15546 if ((AddConstant >= 0 && CompConstant < 0) ||
15547 (AddConstant <= 0 && CompConstant >= -1 &&
15548 CompConstant < AddConstant + MaxUInt))
15551 case AArch64CC::PL:
15552 case AArch64CC::MI:
15553 if ((AddConstant == 0) ||
15554 (AddConstant > 0 && CompConstant <= 0) ||
15555 (AddConstant < 0 && CompConstant <= AddConstant))
15558 case AArch64CC::LO:
15559 case AArch64CC::HS:
15560 if ((AddConstant >= 0 && CompConstant <= 0) ||
15561 (AddConstant <= 0 && CompConstant >= 0 &&
15562 CompConstant <= AddConstant + MaxUInt))
15565 case AArch64CC::EQ:
15566 case AArch64CC::NE:
15567 if ((AddConstant > 0 && CompConstant < 0) ||
15568 (AddConstant < 0 && CompConstant >= 0 &&
15569 CompConstant < AddConstant + MaxUInt) ||
15570 (AddConstant >= 0 && CompConstant >= 0 &&
15571 CompConstant >= AddConstant) ||
15572 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
15575 case AArch64CC::VS:
15576 case AArch64CC::VC:
15577 case AArch64CC::AL:
15578 case AArch64CC::NV:
15580 case AArch64CC::Invalid:
15588 SDValue performCONDCombine(SDNode *N,
15589 TargetLowering::DAGCombinerInfo &DCI,
15590 SelectionDAG &DAG, unsigned CCIndex,
15591 unsigned CmpIndex) {
15592 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
15593 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
15594 unsigned CondOpcode = SubsNode->getOpcode();
  if (CondOpcode != AArch64ISD::SUBS)
    return SDValue();
15599 // There is a SUBS feeding this condition. Is it fed by a mask we can
15602 SDNode *AndNode = SubsNode->getOperand(0).getNode();
15603 unsigned MaskBits = 0;
  if (AndNode->getOpcode() != ISD::AND)
    return SDValue();
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
    uint32_t CNV = CN->getZExtValue();
    if (CNV == 255)
      MaskBits = 8;
    else if (CNV == 65535)
      MaskBits = 16;
  }

  if (!MaskBits)
    return SDValue();
15619 SDValue AddValue = AndNode->getOperand(0);
15621 if (AddValue.getOpcode() != ISD::ADD)
15624 // The basic dag structure is correct, grab the inputs and validate them.
15626 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
15627 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
15628 SDValue SubsInputValue = SubsNode->getOperand(1);
15630 // The mask is present and the provenance of all the values is a smaller type,
  // let's see if the mask is superfluous.
15633 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
      !isa<ConstantSDNode>(SubsInputValue.getNode()))
    return SDValue();
15637 ISD::LoadExtType ExtType;
15639 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
15640 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
    return SDValue();
15644 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
15645 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
           cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
    return SDValue();
15649 // The AND is not necessary, remove it.
15651 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
15652 SubsNode->getValueType(1));
15653 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
15655 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
15656 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
15658 return SDValue(N, 0);
15661 // Optimize compare with zero and branch.
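// For example, a conditional branch on the result of (SUBS x, 0) being zero
// can be emitted as "cbz x, dest" (and the not-equal case as "cbnz"), avoiding
// a separate flag-setting compare.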
15662 static SDValue performBRCONDCombine(SDNode *N,
15663 TargetLowering::DAGCombinerInfo &DCI,
15664 SelectionDAG &DAG) {
15665 MachineFunction &MF = DAG.getMachineFunction();
15666 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
15667 // will not be produced, as they are conditional branch instructions that do
15669 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
    return NV;
15674 SDValue Chain = N->getOperand(0);
15675 SDValue Dest = N->getOperand(1);
15676 SDValue CCVal = N->getOperand(2);
15677 SDValue Cmp = N->getOperand(3);
15679 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
15680 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
15681 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
15684 unsigned CmpOpc = Cmp.getOpcode();
15685 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
  // Only attempt folding if there is only one use of the flag and no use of the
  // value.
15690 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
15693 SDValue LHS = Cmp.getOperand(0);
15694 SDValue RHS = Cmp.getOperand(1);
15696 assert(LHS.getValueType() == RHS.getValueType() &&
15697 "Expected the value type to be the same for both operands!");
15698 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
15701 if (isNullConstant(LHS))
15702 std::swap(LHS, RHS);
15704 if (!isNullConstant(RHS))
15707 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
15708 LHS.getOpcode() == ISD::SRL)
15711 // Fold the compare into the branch instruction.
15713 if (CC == AArch64CC::EQ)
15714 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
15716 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
15718 // Do not add new nodes to DAG combiner worklist.
15719 DCI.CombineTo(N, BR, false);
15724 // Optimize CSEL instructions
15725 static SDValue performCSELCombine(SDNode *N,
15726 TargetLowering::DAGCombinerInfo &DCI,
15727 SelectionDAG &DAG) {
15728 // CSEL x, x, cc -> x
15729 if (N->getOperand(0) == N->getOperand(1))
15730 return N->getOperand(0);
15732 return performCONDCombine(N, DCI, DAG, 2, 3);
15735 static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
15736 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
15737 SDValue LHS = N->getOperand(0);
15738 SDValue RHS = N->getOperand(1);
15739 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
15741 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
15742 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
15743 LHS->getOpcode() == AArch64ISD::CSEL &&
15744 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
15745 LHS->hasOneUse()) {
    SDLoc DL(N);

    // Invert CSEL's condition.
15749 auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
15750 auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
15751 auto NewCond = getInvertedCondCode(OldCond);
15753 // csel 0, 1, !cond, X
    SDValue CSEL =
        DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
15756 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
15757 LHS.getOperand(3));
15758 return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0));
15764 static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) {
15765 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
15766 "Unexpected opcode!");
15768 SDValue Pred = N->getOperand(0);
15769 SDValue LHS = N->getOperand(1);
15770 SDValue RHS = N->getOperand(2);
15771 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
15773 // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
15774 // => inner setcc_merge_zero
15775 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
15776 LHS->getOpcode() == ISD::SIGN_EXTEND &&
15777 LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
15778 LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
15779 LHS->getOperand(0)->getOperand(0) == Pred)
15780 return LHS->getOperand(0);
15785 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
15786 // as well as whether the test should be inverted. This code is required to
15787 // catch these cases (as opposed to standard dag combines) because
15788 // AArch64ISD::TBZ is matched during legalization.
15789 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
15790 SelectionDAG &DAG) {
15792 if (!Op->hasOneUse())
15795 // We don't handle undef/constant-fold cases below, as they should have
15796 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
15799 // (tbz (trunc x), b) -> (tbz x, b)
15800 // This case is just here to enable more of the below cases to be caught.
15801 if (Op->getOpcode() == ISD::TRUNCATE &&
15802 Bit < Op->getValueType(0).getSizeInBits()) {
15803 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15806 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
15807 if (Op->getOpcode() == ISD::ANY_EXTEND &&
15808 Bit < Op->getOperand(0).getValueSizeInBits()) {
15809 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15812 if (Op->getNumOperands() != 2)
15815 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
15819 switch (Op->getOpcode()) {
15823 // (tbz (and x, m), b) -> (tbz x, b)
15825 if ((C->getZExtValue() >> Bit) & 1)
15826 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15829 // (tbz (shl x, c), b) -> (tbz x, b-c)
15831 if (C->getZExtValue() <= Bit &&
15832 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
15833 Bit = Bit - C->getZExtValue();
15834 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15838 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
15840 Bit = Bit + C->getZExtValue();
15841 if (Bit >= Op->getValueType(0).getSizeInBits())
15842 Bit = Op->getValueType(0).getSizeInBits() - 1;
15843 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15845 // (tbz (srl x, c), b) -> (tbz x, b+c)
15847 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
15848 Bit = Bit + C->getZExtValue();
15849 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15853 // (tbz (xor x, -1), b) -> (tbnz x, b)
15855 if ((C->getZExtValue() >> Bit) & 1)
15857 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15861 // Optimize test single bit zero/non-zero and branch.
15862 static SDValue performTBZCombine(SDNode *N,
15863 TargetLowering::DAGCombinerInfo &DCI,
15864 SelectionDAG &DAG) {
15865 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
15866 bool Invert = false;
15867 SDValue TestSrc = N->getOperand(1);
15868 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
  if (TestSrc == NewTestSrc)
    return SDValue();

  unsigned NewOpc = N->getOpcode();
  if (Invert) {
    if (NewOpc == AArch64ISD::TBZ)
      NewOpc = AArch64ISD::TBNZ;
    else {
      assert(NewOpc == AArch64ISD::TBNZ);
      NewOpc = AArch64ISD::TBZ;
    }
  }
15884 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
15885 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
15888 // vselect (v1i1 setcc) ->
15889 // vselect (v1iXX setcc) (XX is the size of the compared operand type)
15890 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such a VSELECT here.
15893 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
15894 SDValue N0 = N->getOperand(0);
15895 EVT CCVT = N0.getValueType();
15897 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
  // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
15899 // supported types.
15900 SDValue SetCC = N->getOperand(0);
15901 if (SetCC.getOpcode() == ISD::SETCC &&
15902 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
15903 SDValue CmpLHS = SetCC.getOperand(0);
15904 EVT VT = CmpLHS.getValueType();
15905 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
15906 SDNode *SplatLHS = N->getOperand(1).getNode();
    SDNode *SplatRHS = N->getOperand(2).getNode();
    APInt SplatLHSVal;
    if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
        VT.isSimple() &&
        is_contained(
15912 makeArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
15913 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
15914 VT.getSimpleVT().SimpleTy) &&
15915 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
15916 SplatLHSVal.isOneValue() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
15917 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
15918 unsigned NumElts = VT.getVectorNumElements();
15919 SmallVector<SDValue, 8> Ops(
15920 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
15921 VT.getScalarType()));
15922 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
15924 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
15925 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
15930 if (N0.getOpcode() != ISD::SETCC ||
15931 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
15932 CCVT.getVectorElementType() != MVT::i1)
15935 EVT ResVT = N->getValueType(0);
15936 EVT CmpVT = N0.getOperand(0).getValueType();
15937 // Only combine when the result type is of the same size as the compared
15939 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
15942 SDValue IfTrue = N->getOperand(1);
15943 SDValue IfFalse = N->getOperand(2);
15944 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
15945 N0.getOperand(0), N0.getOperand(1),
15946 cast<CondCodeSDNode>(N0.getOperand(2))->get());
15947 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
15951 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
15952 /// the compare-mask instructions rather than going via NZCV, even if LHS and
15953 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
15954 /// with a vector one followed by a DUP shuffle on the result.
15955 static SDValue performSelectCombine(SDNode *N,
15956 TargetLowering::DAGCombinerInfo &DCI) {
15957 SelectionDAG &DAG = DCI.DAG;
15958 SDValue N0 = N->getOperand(0);
15959 EVT ResVT = N->getValueType(0);
15961 if (N0.getOpcode() != ISD::SETCC)
15964 if (ResVT.isScalableVector())
15967 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
15968 // scalar SetCCResultType. We also don't expect vectors, because we assume
15969 // that selects fed by vector SETCCs are canonicalized to VSELECT.
15970 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
15971 "Scalar-SETCC feeding SELECT has unexpected result type!");
  // If NumMaskElts == 0, the comparison is larger than the select result. The
15974 // largest real NEON comparison is 64-bits per lane, which means the result is
15975 // at most 32-bits and an illegal vector. Just bail out for now.
15976 EVT SrcVT = N0.getOperand(0).getValueType();
15978 // Don't try to do this optimization when the setcc itself has i1 operands.
15979 // There are no legal vectors of i1, so this would be pointless.
15980 if (SrcVT == MVT::i1)
15983 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
15984 if (!ResVT.isVector() || NumMaskElts == 0)
15987 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
15988 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
15990 // Also bail out if the vector CCVT isn't the same size as ResVT.
15991 // This can happen if the SETCC operand size doesn't divide the ResVT size
15992 // (e.g., f64 vs v3f32).
15993 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
15996 // Make sure we didn't create illegal types, if we're not supposed to.
15997 assert(DCI.isBeforeLegalize() ||
15998 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
  // First perform a vector comparison, where lane 0 is the one we're interested
  // in.
  SDLoc DL(N);
  SDValue LHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
  SDValue RHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
16007 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
16009 // Now duplicate the comparison mask we want across all other lanes.
16010 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
16011 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
16012 Mask = DAG.getNode(ISD::BITCAST, DL,
16013 ResVT.changeVectorElementTypeToInteger(), Mask);
16015 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
16018 /// Get rid of unnecessary NVCASTs (that don't change the type).
16019 static SDValue performNVCASTCombine(SDNode *N) {
16020 if (N->getValueType(0) == N->getOperand(0).getValueType())
16021 return N->getOperand(0);
16026 // If all users of the globaladdr are of the form (globaladdr + constant), find
16027 // the smallest constant, fold it into the globaladdr's offset and rewrite the
16028 // globaladdr as (globaladdr + constant) - constant.
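// For example (illustrative), if G is only used as (G + 8) and (G + 12), G is
// rewritten as ((G + 8) - 8); the first user's addend then folds away entirely
// and the second becomes a small (G + 8) + 4.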
16029 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
16030 const AArch64Subtarget *Subtarget,
16031 const TargetMachine &TM) {
16032 auto *GN = cast<GlobalAddressSDNode>(N);
16033 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
16034 AArch64II::MO_NO_FLAG)
16037 uint64_t MinOffset = -1ull;
  for (SDNode *N : GN->uses()) {
    if (N->getOpcode() != ISD::ADD)
      return SDValue();
    auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
    if (!C)
      C = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!C)
      return SDValue();
    MinOffset = std::min(MinOffset, C->getZExtValue());
  }
16048 uint64_t Offset = MinOffset + GN->getOffset();
16050 // Require that the new offset is larger than the existing one. Otherwise, we
16051 // can end up oscillating between two possible DAGs, for example,
16052 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
16053 if (Offset <= uint64_t(GN->getOffset()))
  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^21 because this is the largest offset expressible in all
  // object types supported by the code model.
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (Offset >= (1 << 21))
    return SDValue();
16067 const GlobalValue *GV = GN->getGlobal();
16068 Type *T = GV->getValueType();
16069 if (!T->isSized() ||
16070 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
16074 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
16075 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
16076 DAG.getConstant(MinOffset, DL, MVT::i64));
// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
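// For example, with BitWidth == 32 each index is shifted left by 2, so a lane
// holding the index 3 becomes the byte offset 12.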
16081 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
16082 SDLoc DL, unsigned BitWidth) {
16083 assert(Offset.getValueType().isScalableVector() &&
16084 "This method is only for scalable vectors of offsets");
16086 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
16087 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
16089 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
16092 /// Check if the value of \p OffsetInBytes can be used as an immediate for
16093 /// the gather load/prefetch and scatter store instructions with vector base and
16094 /// immediate offset addressing mode:
16096 /// [<Zn>.[S|D]{, #<imm>}]
16098 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
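/// For example, with 4-byte elements the valid immediates are 0, 4, 8, ..., 124.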
16099 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
16100 unsigned ScalarSizeInBytes) {
  // The immediate is not a multiple of the scalar size.
  if (OffsetInBytes % ScalarSizeInBytes)
    return false;

  // The immediate is out of range.
  if (OffsetInBytes / ScalarSizeInBytes > 31)
    return false;

  return true;
}
16112 /// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
16114 /// immediate offset addressing mode:
16116 /// [<Zn>.[S|D]{, #<imm>}]
16118 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
16119 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
16120 unsigned ScalarSizeInBytes) {
16121 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
16122 return OffsetConst && isValidImmForSVEVecImmAddrMode(
16123 OffsetConst->getZExtValue(), ScalarSizeInBytes);
16126 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
16128 bool OnlyPackedOffsets = true) {
16129 const SDValue Src = N->getOperand(2);
16130 const EVT SrcVT = Src->getValueType(0);
16131 assert(SrcVT.isScalableVector() &&
16132 "Scatter stores are only possible for SVE vectors");
16135 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
16137 // Make sure that source data will fit into an SVE register
16138 if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
16141 // For FPs, ACLE only supports _packed_ single and double precision types.
16142 if (SrcElVT.isFloatingPoint())
16143 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
16146 // Depending on the addressing mode, this is either a pointer or a vector of
16147 // pointers (that fits into one register)
16148 SDValue Base = N->getOperand(4);
16149 // Depending on the addressing mode, this is either a single offset or a
16150 // vector of offsets (that fits into one register)
16151 SDValue Offset = N->getOperand(5);
16153 // For "scalar + vector of indices", just scale the indices. This only
  // applies to non-temporal scatters because there's no instruction that takes
  // indices.
16156 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
16158 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
16159 Opcode = AArch64ISD::SSTNT1_PRED;
  // In the case of non-temporal scatter stores there's only one SVE instruction
  // per data-size: "scalar + vector", i.e.
  //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
16165 // Since we do have intrinsics that allow the arguments to be in a different
16166 // order, we may need to swap them to match the spec.
16167 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
16168 std::swap(Base, Offset);
16170 // SST1_IMM requires that the offset is an immediate that is:
16171 // * a multiple of #SizeInBytes,
16172 // * in the range [0, 31 x #SizeInBytes],
16173 // where #SizeInBytes is the size in bytes of the stored items. For
16174 // immediates outside that range and non-immediate scalar offsets use SST1 or
16175 // SST1_UXTW instead.
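  // For example, for nxv4i32 data the immediate must be a multiple of 4 in the
  // range [0, 124].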
16176 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
16177 if (!isValidImmForSVEVecImmAddrMode(Offset,
16178 SrcVT.getScalarSizeInBits() / 8)) {
16179 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = AArch64ISD::SST1_UXTW_PRED;
      else
        Opcode = AArch64ISD::SST1_PRED;
16184 std::swap(Base, Offset);
16188 auto &TLI = DAG.getTargetLoweringInfo();
16189 if (!TLI.isTypeLegal(Base.getValueType()))
  // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
  // nxv2i64. Legalize accordingly.
16195 if (!OnlyPackedOffsets &&
16196 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
16197 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
16199 if (!TLI.isTypeLegal(Offset.getValueType()))
16202 // Source value type that is representable in hardware
16203 EVT HwSrcVt = getSVEContainerType(SrcVT);
16205 // Keep the original type of the input data to store - this is needed to be
16206 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
16207 // FP values we want the integer equivalent, so just use HwSrcVt.
16208 SDValue InputVT = DAG.getValueType(SrcVT);
16209 if (SrcVT.isFloatingPoint())
16210 InputVT = DAG.getValueType(HwSrcVt);
16212 SDVTList VTs = DAG.getVTList(MVT::Other);
16215 if (Src.getValueType().isFloatingPoint())
16216 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
16218 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
16220 SDValue Ops[] = {N->getOperand(0), // Chain
16222 N->getOperand(3), // Pg
16227 return DAG.getNode(Opcode, DL, VTs, Ops);
16230 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
16232 bool OnlyPackedOffsets = true) {
16233 const EVT RetVT = N->getValueType(0);
16234 assert(RetVT.isScalableVector() &&
16235 "Gather loads are only possible for SVE vectors");
16239 // Make sure that the loaded data will fit into an SVE register
16240 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
16243 // Depending on the addressing mode, this is either a pointer or a vector of
16244 // pointers (that fits into one register)
16245 SDValue Base = N->getOperand(3);
16246 // Depending on the addressing mode, this is either a single offset or a
16247 // vector of offsets (that fits into one register)
16248 SDValue Offset = N->getOperand(4);
16250 // For "scalar + vector of indices", just scale the indices. This only
  // applies to non-temporal gathers because there's no instruction that takes
  // indices.
16253 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
16254 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
16255 RetVT.getScalarSizeInBits());
16256 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
16259 // In the case of non-temporal gather loads there's only one SVE instruction
16260 // per data-size: "scalar + vector", i.e.
16261 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
16262 // Since we do have intrinsics that allow the arguments to be in a different
16263 // order, we may need to swap them to match the spec.
16264 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
16265 Offset.getValueType().isVector())
16266 std::swap(Base, Offset);
16268 // GLD{FF}1_IMM requires that the offset is an immediate that is:
16269 // * a multiple of #SizeInBytes,
16270 // * in the range [0, 31 x #SizeInBytes],
16271 // where #SizeInBytes is the size in bytes of the loaded items. For
16272 // immediates outside that range and non-immediate scalar offsets use
16273 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
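  // For example, for nxv4i32 results the immediate must be a multiple of 4 in
  // the range [0, 124].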
16274 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
16275 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
16276 if (!isValidImmForSVEVecImmAddrMode(Offset,
16277 RetVT.getScalarSizeInBits() / 8)) {
16278 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
16279 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
16280 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
                     : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
      else
        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
16284 ? AArch64ISD::GLD1_MERGE_ZERO
16285 : AArch64ISD::GLDFF1_MERGE_ZERO;
16287 std::swap(Base, Offset);
16291 auto &TLI = DAG.getTargetLoweringInfo();
16292 if (!TLI.isTypeLegal(Base.getValueType()))
  // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
  // nxv2i64. Legalize accordingly.
16298 if (!OnlyPackedOffsets &&
16299 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
16300 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
16302 // Return value type that is representable in hardware
16303 EVT HwRetVt = getSVEContainerType(RetVT);
16305 // Keep the original output value type around - this is needed to be able to
16306 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
16307 // values we want the integer equivalent, so just use HwRetVT.
16308 SDValue OutVT = DAG.getValueType(RetVT);
16309 if (RetVT.isFloatingPoint())
16310 OutVT = DAG.getValueType(HwRetVt);
16312 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
16313 SDValue Ops[] = {N->getOperand(0), // Chain
16314 N->getOperand(2), // Pg
16315 Base, Offset, OutVT};
16317 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
16318 SDValue LoadChain = SDValue(Load.getNode(), 1);
16320 if (RetVT.isInteger() && (RetVT != HwRetVt))
16321 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
16323 // If the original return value was FP, bitcast accordingly. Doing it here
16324 // means that we can avoid adding TableGen patterns for FPs.
16325 if (RetVT.isFloatingPoint())
16326 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
16328 return DAG.getMergeValues({Load, LoadChain}, DL);
16332 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16333 SelectionDAG &DAG) {
16335 SDValue Src = N->getOperand(0);
16336 unsigned Opc = Src->getOpcode();
16338 // Sign extend of an unsigned unpack -> signed unpack
16339 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
16341 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
16342 : AArch64ISD::SUNPKLO;
16344 // Push the sign extend to the operand of the unpack
16345 // This is necessary where, for example, the operand of the unpack
16346 // is another unpack:
16347 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
16349 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
16351 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
16352 SDValue ExtOp = Src->getOperand(0);
16353 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
16354 EVT EltTy = VT.getVectorElementType();
16357 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
16358 "Sign extending from an invalid type");
16360 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
16362 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
16363 ExtOp, DAG.getValueType(ExtVT));
16365 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
16368 if (DCI.isBeforeLegalizeOps())
16371 if (!EnableCombineMGatherIntrinsics)
16374 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
16375 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
16377 unsigned MemVTOpNum = 4;
16379 case AArch64ISD::LD1_MERGE_ZERO:
16380 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
16383 case AArch64ISD::LDNF1_MERGE_ZERO:
16384 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
16387 case AArch64ISD::LDFF1_MERGE_ZERO:
16388 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
16391 case AArch64ISD::GLD1_MERGE_ZERO:
16392 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
16394 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
16395 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
16397 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
16398 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
16400 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
16401 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
16403 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
16404 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
16406 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
16407 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
16409 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
16410 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
16412 case AArch64ISD::GLDFF1_MERGE_ZERO:
16413 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
16415 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
16416 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
16418 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
16419 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
16421 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
16422 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
16424 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
16425 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
16427 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
16428 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
16430 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
16431 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
16433 case AArch64ISD::GLDNT1_MERGE_ZERO:
16434 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
16440 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
16441 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
16443 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
16446 EVT DstVT = N->getValueType(0);
16447 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
16449 SmallVector<SDValue, 5> Ops;
16450 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
16451 Ops.push_back(Src->getOperand(I));
16453 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
16454 DCI.CombineTo(N, ExtLoad);
16455 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
16457 // Return N so it doesn't get rechecked
16458 return SDValue(N, 0);
16461 /// Legalize the gather prefetch (scalar + vector addressing mode) when the
16462 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
16463 /// != nxv2i32) do not need legalization.
16464 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
16465 const unsigned OffsetPos = 4;
16466 SDValue Offset = N->getOperand(OffsetPos);
16468 // Not an unpacked vector, bail out.
16469 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
16472 // Extend the unpacked offset vector to 64-bit lanes.
16474 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
16475 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
16476 // Replace the offset operand with the 64-bit one.
16477 Ops[OffsetPos] = Offset;
16479 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
16482 /// Combines a node carrying the intrinsic
16483 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
16484 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
16485 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
16486 /// sve gather prefetch instruction with vector plus immediate addressing mode.
16487 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
16488 unsigned ScalarSizeInBytes) {
16489 const unsigned ImmPos = 4, OffsetPos = 3;
16490 // No need to combine the node if the immediate is valid...
16491 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
16494 // ...otherwise swap the offset base with the offset...
16495 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
16496 std::swap(Ops[ImmPos], Ops[OffsetPos]);
16497 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
16498 // `aarch64_sve_prfb_gather_uxtw_index`.
16500 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
16503 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
16506 // Return true if the vector operation can guarantee only the first lane of its
16507 // result contains data, with all bits in other lanes set to zero.
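// For example, the across-vector reductions below (e.g. UADDV_PRED) produce
// their scalar result in lane 0 and leave the remaining lanes zeroed.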
16508 static bool isLanes1toNKnownZero(SDValue Op) {
16509 switch (Op.getOpcode()) {
16512 case AArch64ISD::ANDV_PRED:
16513 case AArch64ISD::EORV_PRED:
16514 case AArch64ISD::FADDA_PRED:
16515 case AArch64ISD::FADDV_PRED:
16516 case AArch64ISD::FMAXNMV_PRED:
16517 case AArch64ISD::FMAXV_PRED:
16518 case AArch64ISD::FMINNMV_PRED:
16519 case AArch64ISD::FMINV_PRED:
16520 case AArch64ISD::ORV_PRED:
16521 case AArch64ISD::SADDV_PRED:
16522 case AArch64ISD::SMAXV_PRED:
16523 case AArch64ISD::SMINV_PRED:
16524 case AArch64ISD::UADDV_PRED:
16525 case AArch64ISD::UMAXV_PRED:
16526 case AArch64ISD::UMINV_PRED:
16531 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
16532 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
16533 SDValue InsertVec = N->getOperand(0);
16534 SDValue InsertElt = N->getOperand(1);
16535 SDValue InsertIdx = N->getOperand(2);
16537 // We only care about inserts into the first element...
16538 if (!isNullConstant(InsertIdx))
16540 // ...of a zero'd vector...
16541 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
16543 // ...where the inserted data was previously extracted...
16544 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16547 SDValue ExtractVec = InsertElt.getOperand(0);
16548 SDValue ExtractIdx = InsertElt.getOperand(1);
16550 // ...from the first element of a vector.
16551 if (!isNullConstant(ExtractIdx))
16554 // If we get here we are effectively trying to zero lanes 1-N of a vector.
16556 // Ensure there's no type conversion going on.
16557 if (N->getValueType(0) != ExtractVec.getValueType())
16560 if (!isLanes1toNKnownZero(ExtractVec))
16563 // The explicit zeroing is redundant.
16568 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16569 if (SDValue Res = removeRedundantInsertVectorElt(N))
  return performPostLD1Combine(N, DCI, true);
}
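// Lower a floating-point VECTOR_SPLICE by performing the splice on the
// equivalent packed integer type and bitcasting back; integer splices are
// left unchanged.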
16575 SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
16576 EVT Ty = N->getValueType(0);
  if (Ty.isInteger())
    return SDValue();
16580 EVT IntTy = Ty.changeVectorElementTypeToInteger();
16581 EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
16582 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
16583 IntTy.getVectorElementType().getScalarSizeInBits())
16587 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
16589 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
16591 SDValue Idx = N->getOperand(2);
16592 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
16593 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
16594 return DAG.getBitcast(Ty, Trunc);
16597 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
16598 DAGCombinerInfo &DCI) const {
16599 SelectionDAG &DAG = DCI.DAG;
16600 switch (N->getOpcode()) {
16602 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
16606 return performAddSubCombine(N, DCI, DAG);
16608 return performXorCombine(N, DAG, DCI, Subtarget);
16610 return performMulCombine(N, DAG, DCI, Subtarget);
16611 case ISD::SINT_TO_FP:
16612 case ISD::UINT_TO_FP:
16613 return performIntToFpCombine(N, DAG, Subtarget);
16614 case ISD::FP_TO_SINT:
16615 case ISD::FP_TO_UINT:
16616 return performFpToIntCombine(N, DAG, DCI, Subtarget);
16618 return performFDivCombine(N, DAG, DCI, Subtarget);
16620 return performORCombine(N, DCI, Subtarget);
16622 return performANDCombine(N, DCI);
16624 return performSRLCombine(N, DCI);
16625 case ISD::INTRINSIC_WO_CHAIN:
16626 return performIntrinsicCombine(N, DCI, Subtarget);
16627 case ISD::ANY_EXTEND:
16628 case ISD::ZERO_EXTEND:
16629 case ISD::SIGN_EXTEND:
16630 return performExtendCombine(N, DCI, DAG);
16631 case ISD::SIGN_EXTEND_INREG:
16632 return performSignExtendInRegCombine(N, DCI, DAG);
16633 case ISD::TRUNCATE:
16634 return performVectorTruncateCombine(N, DCI, DAG);
16635 case ISD::CONCAT_VECTORS:
16636 return performConcatVectorsCombine(N, DCI, DAG);
16638 return performSelectCombine(N, DCI);
16640 return performVSelectCombine(N, DCI.DAG);
16642 return performSETCCCombine(N, DAG);
16644 if (performTBISimplification(N->getOperand(1), DCI, DAG))
16645 return SDValue(N, 0);
16648 return performSTORECombine(N, DCI, DAG, Subtarget);
16649 case ISD::VECTOR_SPLICE:
16650 return performSVESpliceCombine(N, DAG);
16651 case AArch64ISD::BRCOND:
16652 return performBRCONDCombine(N, DCI, DAG);
16653 case AArch64ISD::TBNZ:
16654 case AArch64ISD::TBZ:
16655 return performTBZCombine(N, DCI, DAG);
16656 case AArch64ISD::CSEL:
16657 return performCSELCombine(N, DCI, DAG);
16658 case AArch64ISD::DUP:
16659 return performPostLD1Combine(N, DCI, false);
16660 case AArch64ISD::NVCAST:
16661 return performNVCASTCombine(N);
16662 case AArch64ISD::SPLICE:
16663 return performSpliceCombine(N, DAG);
16664 case AArch64ISD::UZP1:
16665 return performUzpCombine(N, DAG);
16666 case AArch64ISD::SETCC_MERGE_ZERO:
16667 return performSetccMergeZeroCombine(N, DAG);
16668 case AArch64ISD::GLD1_MERGE_ZERO:
16669 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
16670 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
16671 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
16672 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
16673 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
16674 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
16675 case AArch64ISD::GLD1S_MERGE_ZERO:
16676 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
16677 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
16678 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
16679 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
16680 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
16681 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
16682 return performGLD1Combine(N, DAG);
16683 case AArch64ISD::VASHR:
16684 case AArch64ISD::VLSHR:
16685 return performVectorShiftCombine(N, *this, DCI);
16686 case ISD::INSERT_VECTOR_ELT:
16687 return performInsertVectorEltCombine(N, DCI);
16688 case ISD::EXTRACT_VECTOR_ELT:
16689 return performExtractVectorEltCombine(N, DAG);
16690 case ISD::VECREDUCE_ADD:
16691 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
16692 case ISD::INTRINSIC_VOID:
16693 case ISD::INTRINSIC_W_CHAIN:
16694 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
16695 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
16696 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
16697 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
16698 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
16699 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
16700 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
16701 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
16702 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
16703 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
16704 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
16705 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
16706 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
16707 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
16708 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
16709 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
16710 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
16711 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
16712 case Intrinsic::aarch64_neon_ld2:
16713 case Intrinsic::aarch64_neon_ld3:
16714 case Intrinsic::aarch64_neon_ld4:
16715 case Intrinsic::aarch64_neon_ld1x2:
16716 case Intrinsic::aarch64_neon_ld1x3:
16717 case Intrinsic::aarch64_neon_ld1x4:
16718 case Intrinsic::aarch64_neon_ld2lane:
16719 case Intrinsic::aarch64_neon_ld3lane:
16720 case Intrinsic::aarch64_neon_ld4lane:
16721 case Intrinsic::aarch64_neon_ld2r:
16722 case Intrinsic::aarch64_neon_ld3r:
16723 case Intrinsic::aarch64_neon_ld4r:
16724 case Intrinsic::aarch64_neon_st2:
16725 case Intrinsic::aarch64_neon_st3:
16726 case Intrinsic::aarch64_neon_st4:
16727 case Intrinsic::aarch64_neon_st1x2:
16728 case Intrinsic::aarch64_neon_st1x3:
16729 case Intrinsic::aarch64_neon_st1x4:
16730 case Intrinsic::aarch64_neon_st2lane:
16731 case Intrinsic::aarch64_neon_st3lane:
16732 case Intrinsic::aarch64_neon_st4lane:
16733 return performNEONPostLDSTCombine(N, DCI, DAG);
16734 case Intrinsic::aarch64_sve_ldnt1:
16735 return performLDNT1Combine(N, DAG);
16736 case Intrinsic::aarch64_sve_ld1rq:
16737 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
16738 case Intrinsic::aarch64_sve_ld1ro:
16739 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
16740 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
16741 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
16742 case Intrinsic::aarch64_sve_ldnt1_gather:
16743 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
16744 case Intrinsic::aarch64_sve_ldnt1_gather_index:
16745 return performGatherLoadCombine(N, DAG,
16746 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
16747 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
16748 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
16749 case Intrinsic::aarch64_sve_ld1:
16750 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
16751 case Intrinsic::aarch64_sve_ldnf1:
16752 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
16753 case Intrinsic::aarch64_sve_ldff1:
16754 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
16755 case Intrinsic::aarch64_sve_st1:
16756 return performST1Combine(N, DAG);
16757 case Intrinsic::aarch64_sve_stnt1:
16758 return performSTNT1Combine(N, DAG);
16759 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
16760 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
16761 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
16762 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
16763 case Intrinsic::aarch64_sve_stnt1_scatter:
16764 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
16765 case Intrinsic::aarch64_sve_stnt1_scatter_index:
16766 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
16767 case Intrinsic::aarch64_sve_ld1_gather:
16768 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
16769 case Intrinsic::aarch64_sve_ld1_gather_index:
16770 return performGatherLoadCombine(N, DAG,
16771 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
16772 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
16773 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
16774 /*OnlyPackedOffsets=*/false);
16775 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
16776 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
16777 /*OnlyPackedOffsets=*/false);
16778 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
16779 return performGatherLoadCombine(N, DAG,
16780 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
16781 /*OnlyPackedOffsets=*/false);
16782 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
16783 return performGatherLoadCombine(N, DAG,
16784 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
16785 /*OnlyPackedOffsets=*/false);
16786 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
16787 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
16788 case Intrinsic::aarch64_sve_ldff1_gather:
16789 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
16790 case Intrinsic::aarch64_sve_ldff1_gather_index:
16791 return performGatherLoadCombine(N, DAG,
16792 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
16793 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
16794 return performGatherLoadCombine(N, DAG,
16795 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
16796 /*OnlyPackedOffsets=*/false);
16797 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
16798 return performGatherLoadCombine(N, DAG,
16799 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
16800 /*OnlyPackedOffsets=*/false);
16801 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
16802 return performGatherLoadCombine(N, DAG,
16803 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
16804 /*OnlyPackedOffsets=*/false);
16805 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
16806 return performGatherLoadCombine(N, DAG,
16807 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
16808 /*OnlyPackedOffsets=*/false);
16809 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
16810 return performGatherLoadCombine(N, DAG,
16811 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
16812 case Intrinsic::aarch64_sve_st1_scatter:
16813 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
16814 case Intrinsic::aarch64_sve_st1_scatter_index:
16815 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
16816 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
16817 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
16818 /*OnlyPackedOffsets=*/false);
16819 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
16820 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
16821 /*OnlyPackedOffsets=*/false);
16822 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
16823 return performScatterStoreCombine(N, DAG,
16824 AArch64ISD::SST1_SXTW_SCALED_PRED,
16825 /*OnlyPackedOffsets=*/false);
16826 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
16827 return performScatterStoreCombine(N, DAG,
16828 AArch64ISD::SST1_UXTW_SCALED_PRED,
16829 /*OnlyPackedOffsets=*/false);
16830 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
16831 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
16832 case Intrinsic::aarch64_sve_tuple_get: {
16834 SDValue Chain = N->getOperand(0);
16835 SDValue Src1 = N->getOperand(2);
16836 SDValue Idx = N->getOperand(3);
16838 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
16839 EVT ResVT = N->getValueType(0);
16840 uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue();
16841 SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL);
16843 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx);
16844 return DAG.getMergeValues({Val, Chain}, DL);
16846 case Intrinsic::aarch64_sve_tuple_set: {
16848 SDValue Chain = N->getOperand(0);
16849 SDValue Tuple = N->getOperand(2);
16850 SDValue Idx = N->getOperand(3);
16851 SDValue Vec = N->getOperand(4);
16853 EVT TupleVT = Tuple.getValueType();
16854 uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue();
16856 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
16857 uint64_t NumLanes =
16858 Vec.getValueType().getVectorElementCount().getKnownMinValue();
16860 if ((TupleLanes % NumLanes) != 0)
16861 report_fatal_error("invalid tuple vector!");
16863 uint64_t NumVecs = TupleLanes / NumLanes;
16865 SmallVector<SDValue, 4> Opnds;
16866 for (unsigned I = 0; I < NumVecs; ++I) {
      if (I == IdxConst)
        Opnds.push_back(Vec);
      else {
        SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL);
        Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
                                    Vec.getValueType(), Tuple, ExtIdx));
      }
    }
16876 DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
16877 return DAG.getMergeValues({Concat, Chain}, DL);
16879 case Intrinsic::aarch64_sve_tuple_create2:
16880 case Intrinsic::aarch64_sve_tuple_create3:
16881 case Intrinsic::aarch64_sve_tuple_create4: {
16883 SDValue Chain = N->getOperand(0);
16885 SmallVector<SDValue, 4> Opnds;
16886 for (unsigned I = 2; I < N->getNumOperands(); ++I)
16887 Opnds.push_back(N->getOperand(I));
16889 EVT VT = Opnds[0].getValueType();
16890 EVT EltVT = VT.getVectorElementType();
16891 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
16892 VT.getVectorElementCount() *
16893 (N->getNumOperands() - 2));
16894 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
16895 return DAG.getMergeValues({Concat, Chain}, DL);
16897 case Intrinsic::aarch64_sve_ld2:
16898 case Intrinsic::aarch64_sve_ld3:
16899 case Intrinsic::aarch64_sve_ld4: {
16901 SDValue Chain = N->getOperand(0);
16902 SDValue Mask = N->getOperand(2);
16903 SDValue BasePtr = N->getOperand(3);
16904 SDValue LoadOps[] = {Chain, Mask, BasePtr};
16905 unsigned IntrinsicID =
16906 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
16908 LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
16909 return DAG.getMergeValues({Result, Chain}, DL);
16911 case Intrinsic::aarch64_rndr:
16912 case Intrinsic::aarch64_rndrrs: {
16913 unsigned IntrinsicID =
16914 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
16916 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
16917 : AArch64SysReg::RNDRRS);
16919 SDValue A = DAG.getNode(
16920 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
16921 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
16922 SDValue B = DAG.getNode(
16923 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
16924 DAG.getConstant(0, DL, MVT::i32),
16925 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
16926 return DAG.getMergeValues(
16927 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
16933 case ISD::GlobalAddress:
16934 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
// Check if the return value is used only as a return value, as otherwise
16940 // we can't perform a tail-call. In particular, we need to check for
16941 // target ISD nodes that are returns and any other "odd" constructs
16942 // that the generic analysis code won't necessarily catch.
16943 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
16944 SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;
16950 SDValue TCChain = Chain;
16951 SDNode *Copy = *N->use_begin();
16952 if (Copy->getOpcode() == ISD::CopyToReg) {
16953 // If the copy has a glue operand, we conservatively assume it isn't safe to
16954 // perform a tail call.
16955 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
16958 TCChain = Copy->getOperand(0);
16959 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
16962 bool HasRet = false;
16963 for (SDNode *Node : Copy->uses()) {
16964 if (Node->getOpcode() != AArch64ISD::RET_FLAG)
// Return whether an instruction can potentially be optimized to a tail call.
// This will cause the optimizers to attempt to move, or duplicate, return
// instructions to help enable tail call optimizations for this instruction.
16980 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
16981 return CI->isTailCall();
16984 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
16986 ISD::MemIndexedMode &AM,
16988 SelectionDAG &DAG) const {
16989 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
16992 Base = Op->getOperand(0);
16993 // All of the indexed addressing mode instructions take a signed
16994 // 9 bit immediate offset.
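  // That is, the offset must be in the range [-256, 255].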
16995 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
16996 int64_t RHSC = RHS->getSExtValue();
16997 if (Op->getOpcode() == ISD::SUB)
16998 RHSC = -(uint64_t)RHSC;
16999 if (!isInt<9>(RHSC))
17001 IsInc = (Op->getOpcode() == ISD::ADD);
17002 Offset = Op->getOperand(1);
17008 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
17010 ISD::MemIndexedMode &AM,
17011 SelectionDAG &DAG) const {
17014 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
17015 VT = LD->getMemoryVT();
17016 Ptr = LD->getBasePtr();
17017 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
17018 VT = ST->getMemoryVT();
17019 Ptr = ST->getBasePtr();
17024 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
17026 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
17030 bool AArch64TargetLowering::getPostIndexedAddressParts(
17031 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
17032 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
17035 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
17036 VT = LD->getMemoryVT();
17037 Ptr = LD->getBasePtr();
17038 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
17039 VT = ST->getMemoryVT();
17040 Ptr = ST->getBasePtr();
17045 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
17047 // Post-indexing updates the base, so it's not a valid transform
17048 // if that's not the same as the load's pointer.
  if (Ptr != Base)
    return false;

  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
17055 void AArch64TargetLowering::ReplaceBITCASTResults(
17056 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17058 SDValue Op = N->getOperand(0);
17059 EVT VT = N->getValueType(0);
17060 EVT SrcVT = Op.getValueType();
17062 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
17063 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
17064 "Expected fp->int bitcast!");
17065 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
17066 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
17070 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
17074 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
17075 DAG.getUNDEF(MVT::i32), Op,
17076 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
17078 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
17079 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
17082 static void ReplaceReductionResults(SDNode *N,
17083 SmallVectorImpl<SDValue> &Results,
17084 SelectionDAG &DAG, unsigned InterOp,
17085 unsigned AcrossOp) {
17089 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
17090 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
17091 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
17092 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
  Results.push_back(SplitVal);
}
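// Split a scalar i128 value into its low and high i64 halves.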
17096 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
17098 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
17099 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
17100 DAG.getNode(ISD::SRL, DL, MVT::i128, N,
17101 DAG.getConstant(64, DL, MVT::i64)));
17102 return std::make_pair(Lo, Hi);
17105 void AArch64TargetLowering::ReplaceExtractSubVectorResults(
17106 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17107 SDValue In = N->getOperand(0);
17108 EVT InVT = In.getValueType();
17110 // Common code will handle these just fine.
17111 if (!InVT.isScalableVector() || !InVT.isInteger())
17115 EVT VT = N->getValueType(0);
17117 // The following checks bail if this is not a halving operation.
17119 ElementCount ResEC = VT.getVectorElementCount();
17121 if (InVT.getVectorElementCount() != (ResEC * 2))
17124 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
17128 unsigned Index = CIndex->getZExtValue();
17129 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
17132 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
17133 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
17135 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
17136 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
17139 // Create an even/odd pair of X registers holding integer value V.
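// The pair is built with REG_SEQUENCE using the sube64/subo64 subregisters,
// which is the form expected by the 128-bit CASP compare-and-swap lowering.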
17140 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
17141 SDLoc dl(V.getNode());
17142 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
17143 SDValue VHi = DAG.getAnyExtOrTrunc(
17144 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
17146 if (DAG.getDataLayout().isBigEndian())
17147 std::swap (VLo, VHi);
17149 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
17150 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
17151 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
17152 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
17154 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
17157 static void ReplaceCMP_SWAP_128Results(SDNode *N,
17158 SmallVectorImpl<SDValue> &Results,
17160 const AArch64Subtarget *Subtarget) {
17161 assert(N->getValueType(0) == MVT::i128 &&
17162 "AtomicCmpSwap on types less than 128 should be legal");
17164 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
17165 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
17166 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
17167 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
17169 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
17170 createGPRPairNode(DAG, N->getOperand(3)), // Store value
17171 N->getOperand(1), // Ptr
17172 N->getOperand(0), // Chain in
17176 switch (MemOp->getMergedOrdering()) {
17177 case AtomicOrdering::Monotonic:
17178 Opcode = AArch64::CASPX;
17180 case AtomicOrdering::Acquire:
17181 Opcode = AArch64::CASPAX;
17183 case AtomicOrdering::Release:
17184 Opcode = AArch64::CASPLX;
17186 case AtomicOrdering::AcquireRelease:
17187 case AtomicOrdering::SequentiallyConsistent:
17188 Opcode = AArch64::CASPALX;
17191 llvm_unreachable("Unexpected ordering!");
17194 MachineSDNode *CmpSwap = DAG.getMachineNode(
17195 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
17196 DAG.setNodeMemRefs(CmpSwap, {MemOp});
17198 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
17199 if (DAG.getDataLayout().isBigEndian())
17200 std::swap(SubReg1, SubReg2);
17201 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
17202 SDValue(CmpSwap, 0));
17203 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
17204 SDValue(CmpSwap, 0));
17206 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
17207 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
17212 switch (MemOp->getMergedOrdering()) {
17213 case AtomicOrdering::Monotonic:
17214 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
17216 case AtomicOrdering::Acquire:
17217 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
17219 case AtomicOrdering::Release:
17220 Opcode = AArch64::CMP_SWAP_128_RELEASE;
17222 case AtomicOrdering::AcquireRelease:
17223 case AtomicOrdering::SequentiallyConsistent:
17224 Opcode = AArch64::CMP_SWAP_128;
17227 llvm_unreachable("Unexpected ordering!");
17230 auto Desired = splitInt128(N->getOperand(2), DAG);
17231 auto New = splitInt128(N->getOperand(3), DAG);
17232 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
17233 New.first, New.second, N->getOperand(0)};
17234 SDNode *CmpSwap = DAG.getMachineNode(
17235 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
17237 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
17239 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
17240 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
17241 Results.push_back(SDValue(CmpSwap, 3));
17244 void AArch64TargetLowering::ReplaceNodeResults(
17245 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17246 switch (N->getOpcode()) {
17248 llvm_unreachable("Don't know how to custom expand this");
17250 ReplaceBITCASTResults(N, Results, DAG);
17252 case ISD::VECREDUCE_ADD:
17253 case ISD::VECREDUCE_SMAX:
17254 case ISD::VECREDUCE_SMIN:
17255 case ISD::VECREDUCE_UMAX:
17256 case ISD::VECREDUCE_UMIN:
17257 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
17261 if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
17262 Results.push_back(Result);
17264 case AArch64ISD::SADDV:
17265 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
17267 case AArch64ISD::UADDV:
17268 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
17270 case AArch64ISD::SMINV:
17271 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
17273 case AArch64ISD::UMINV:
17274 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
17276 case AArch64ISD::SMAXV:
17277 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
17279 case AArch64ISD::UMAXV:
17280 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
17282 case ISD::FP_TO_UINT:
17283 case ISD::FP_TO_SINT:
17284 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
17285 // Let normal code take care of it by not adding anything to Results.
17287 case ISD::ATOMIC_CMP_SWAP:
17288 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
17291 assert(SDValue(N, 0).getValueType() == MVT::i128 &&
17292 "unexpected load's value type");
17293 LoadSDNode *LoadNode = cast<LoadSDNode>(N);
17294 if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
    // Non-volatile loads are optimized later in AArch64's load/store
    // optimizer.
    return;
  }
17300 SDValue Result = DAG.getMemIntrinsicNode(
17301 AArch64ISD::LDP, SDLoc(N),
17302 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
17303 {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
17304 LoadNode->getMemOperand());
17306 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
17307 Result.getValue(0), Result.getValue(1));
17308 Results.append({Pair, Result.getValue(2) /* Chain */});
17311 case ISD::EXTRACT_SUBVECTOR:
17312 ReplaceExtractSubVectorResults(N, Results, DAG);
17314 case ISD::INSERT_SUBVECTOR:
17315 // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate
17316 // to common code for result type legalisation
17318 case ISD::INTRINSIC_WO_CHAIN: {
17319 EVT VT = N->getValueType(0);
17320 assert((VT == MVT::i8 || VT == MVT::i16) &&
17321 "custom lowering for unexpected type");
17323 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
17324 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
17328 case Intrinsic::aarch64_sve_clasta_n: {
17330 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
17331 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
17332 N->getOperand(1), Op2, N->getOperand(3));
17333 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17336 case Intrinsic::aarch64_sve_clastb_n: {
17338 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
17339 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
17340 N->getOperand(1), Op2, N->getOperand(3));
17341 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17344 case Intrinsic::aarch64_sve_lasta: {
17346 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
17347 N->getOperand(1), N->getOperand(2));
17348 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17351 case Intrinsic::aarch64_sve_lastb: {
17353 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
17354 N->getOperand(1), N->getOperand(2));
17355 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17363 bool AArch64TargetLowering::useLoadStackGuardNode() const {
17364 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
17365 return TargetLowering::useLoadStackGuardNode();
17369 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
17370 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are three or more FDIVs.
  return 3;
}
17375 TargetLoweringBase::LegalizeTypeAction
17376 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
17377 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
17378 // v4i16, v2i32 instead of to promote.
17379 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
17381 return TypeWidenVector;
17383 return TargetLoweringBase::getPreferredVectorAction(VT);
17386 // Loads and stores less than 128-bits are already atomic; ones above that
17387 // are doomed anyway, so defer to the default libcall and blame the OS when
17388 // things go wrong.
17389 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
17390 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
17391 return Size == 128;
17394 // Loads and stores less than 128-bits are already atomic; ones above that
17395 // are doomed anyway, so defer to the default libcall and blame the OS when
17396 // things go wrong.
17397 TargetLowering::AtomicExpansionKind
17398 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
17399 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
17400 return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
17403 // For the real atomic operations, we have ldxr/stxr up to 128 bits,
17404 TargetLowering::AtomicExpansionKind
17405 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
17406 if (AI->isFloatingPointOperation())
17407 return AtomicExpansionKind::CmpXChg;
17409 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
17410 if (Size > 128) return AtomicExpansionKind::None;
17412 // Nand is not supported in LSE.
17413 // Leave 128 bits to LLSC or CmpXChg.
17414 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
17415 if (Subtarget->hasLSE())
17416 return AtomicExpansionKind::None;
17417 if (Subtarget->outlineAtomics()) {
17418 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
17419 // Don't outline them unless
17420 // (1) high level <atomic> support approved:
17421 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
17422 // (2) low level libgcc and compiler-rt support implemented by:
17423 // min/max outline atomics helpers
17424 if (AI->getOperation() != AtomicRMWInst::Min &&
17425 AI->getOperation() != AtomicRMWInst::Max &&
17426 AI->getOperation() != AtomicRMWInst::UMin &&
17427 AI->getOperation() != AtomicRMWInst::UMax) {
17428 return AtomicExpansionKind::None;
17433 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
17434 // implement atomicrmw without spilling. If the target address is also on the
17435 // stack and close enough to the spill slot, this can lead to a situation
17436 // where the monitor always gets cleared and the atomic operation can never
17437 // succeed. So at -O0 lower this operation to a CAS loop.
17438 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
17439 return AtomicExpansionKind::CmpXChg;
17441 return AtomicExpansionKind::LLSC;
17444 TargetLowering::AtomicExpansionKind
17445 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
17446 AtomicCmpXchgInst *AI) const {
17447 // If subtarget has LSE, leave cmpxchg intact for codegen.
17448 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
17449 return AtomicExpansionKind::None;
17450 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
17451 // implement cmpxchg without spilling. If the address being exchanged is also
17452 // on the stack and close enough to the spill slot, this can lead to a
17453 // situation where the monitor always gets cleared and the atomic operation
17454 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
17455 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
17456 return AtomicExpansionKind::None;
  // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
  // it.
  unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
  if (Size > 128)
    return AtomicExpansionKind::None;
17464 return AtomicExpansionKind::LLSC;
17467 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
17468 Type *ValueTy, Value *Addr,
17469 AtomicOrdering Ord) const {
17470 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17471 bool IsAcquire = isAcquireOrStronger(Ord);
17473 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
17474 // intrinsic must return {i64, i64} and we have to recombine them into a
17475 // single i128 here.
17476 if (ValueTy->getPrimitiveSizeInBits() == 128) {
17477 Intrinsic::ID Int =
17478 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
17479 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
17481 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
17482 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
17484 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
17485 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
17486 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
17487 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
17488 return Builder.CreateOr(
17489 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
17492 Type *Tys[] = { Addr->getType() };
17493 Intrinsic::ID Int =
17494 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
17495 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
17497 const DataLayout &DL = M->getDataLayout();
17498 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
17499 Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
17501 return Builder.CreateBitCast(Trunc, ValueTy);
17504 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
17505 IRBuilderBase &Builder) const {
17506 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17507 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
17510 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
17511 Value *Val, Value *Addr,
17512 AtomicOrdering Ord) const {
17513 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17514 bool IsRelease = isReleaseOrStronger(Ord);
17516 // Since the intrinsics must have legal type, the i128 intrinsics take two
17517 // parameters: "i64, i64". We must marshal Val into the appropriate form
17518 // before the call.
17519 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
17520 Intrinsic::ID Int =
17521 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
17522 Function *Stxr = Intrinsic::getDeclaration(M, Int);
17523 Type *Int64Ty = Type::getInt64Ty(M->getContext());
17525 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
17526 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
17527 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
17528 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
17531 Intrinsic::ID Int =
17532 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
17533 Type *Tys[] = { Addr->getType() };
17534 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
17536 const DataLayout &DL = M->getDataLayout();
17537 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
17538 Val = Builder.CreateBitCast(Val, IntValTy);
17540 return Builder.CreateCall(Stxr,
17541 {Builder.CreateZExtOrBitCast(
17542 Val, Stxr->getFunctionType()->getParamType(0)),
17546 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
17547 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
17548 const DataLayout &DL) const {
17549 if (!Ty->isArrayTy()) {
17550 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
17551 return TySize.isScalable() && TySize.getKnownMinSize() > 128;
17554 // All non aggregate members of the type must have the same type
17555 SmallVector<EVT> ValueVTs;
17556 ComputeValueVTs(*this, DL, Ty, ValueVTs);
17557 return is_splat(ValueVTs);
17560 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
17565 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
17566 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
17567 Function *ThreadPointerFunc =
17568 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
17569 return IRB.CreatePointerCast(
17570 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
17572 IRB.getInt8PtrTy()->getPointerTo(0));
17575 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
17576 // Android provides a fixed TLS slot for the stack cookie. See the definition
17577 // of TLS_SLOT_STACK_GUARD in
17578 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
17579 if (Subtarget->isTargetAndroid())
17580 return UseTlsOffset(IRB, 0x28);
17582 // Fuchsia is similar.
17583 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
17584 if (Subtarget->isTargetFuchsia())
17585 return UseTlsOffset(IRB, -0x10);
17587 return TargetLowering::getIRStackGuard(IRB);
17590 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
17591 // MSVC CRT provides functionalities for stack protection.
17592 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
17593 // MSVC CRT has a global variable holding security cookie.
17594 M.getOrInsertGlobal("__security_cookie",
17595 Type::getInt8PtrTy(M.getContext()));
17597 // MSVC CRT has a function to validate security cookie.
17598 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
17599 "__security_check_cookie", Type::getVoidTy(M.getContext()),
17600 Type::getInt8PtrTy(M.getContext()));
17601 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
17602 F->setCallingConv(CallingConv::Win64);
17603 F->addAttribute(1, Attribute::AttrKind::InReg);
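      // The net effect is roughly (a sketch):
      //   @__security_cookie = external global i8*
      //   declare win64cc void @__security_check_cookie(i8* inreg)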
17607 TargetLowering::insertSSPDeclarations(M);
17610 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
17611 // MSVC CRT has a global variable holding security cookie.
17612 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
17613 return M.getGlobalVariable("__security_cookie");
17614 return TargetLowering::getSDagStackGuard(M);
17617 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
17618 // MSVC CRT has a function to validate security cookie.
17619 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
17620 return M.getFunction("__security_check_cookie");
17621 return TargetLowering::getSSPStackGuardCheck(M);
17625 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
17626 // Android provides a fixed TLS slot for the SafeStack pointer. See the
17627 // definition of TLS_SLOT_SAFESTACK in
17628 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
17629 if (Subtarget->isTargetAndroid())
17630 return UseTlsOffset(IRB, 0x48);
17632 // Fuchsia is similar.
17633 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
17634 if (Subtarget->isTargetFuchsia())
17635 return UseTlsOffset(IRB, -0x8);
17637 return TargetLowering::getSafeStackPointerLocation(IRB);
17640 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
17641 const Instruction &AndI) const {
  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
  // this is likely to fold the and/cmp/br into a single tbz instruction. It
  // may be beneficial to sink in other cases, but we would have to check that
  // the cmp would not get folded into the br to form a cbz for these to be
  // beneficial.
  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask)
    return false;
  return Mask->getValue().isPowerOf2();
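  // e.g. sinking the mask in 'if ((x & 0x20) == 0)' lets ISel emit a single
  // tbz on bit 5 (an illustrative case; multi-bit masks are left alone).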
17653 bool AArch64TargetLowering::
17654 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
17655 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
17656 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
17657 SelectionDAG &DAG) const {
17658 // Does baseline recommend not to perform the fold by default?
  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // Else, if this is a vector shift, prefer 'shl'.
  return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
}
bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
                                              SDValue N) const {
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
    return false;
  return true;
}
17674 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
17676 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
17677 AFI->setIsSplitCSR(true);
17680 void AArch64TargetLowering::insertCopiesSplitCSR(
17681 MachineBasicBlock *Entry,
17682 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
17683 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;
17688 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
17689 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
17690 MachineBasicBlock::iterator MBBI = Entry->begin();
17691 for (const MCPhysReg *I = IStart; *I; ++I) {
17692 const TargetRegisterClass *RC = nullptr;
17693 if (AArch64::GPR64RegClass.contains(*I))
17694 RC = &AArch64::GPR64RegClass;
17695 else if (AArch64::FPR64RegClass.contains(*I))
17696 RC = &AArch64::FPR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
17700 Register NewVR = MRI->createVirtualRegister(RC);
17701 // Create copy from CSR to a virtual register.
17702 // FIXME: this currently does not emit CFI pseudo-instructions, it works
17703 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
17704 // nounwind. If we want to generalize this later, we may need to emit
17705 // CFI pseudo-instructions.
17706 assert(Entry->getParent()->getFunction().hasFnAttribute(
17707 Attribute::NoUnwind) &&
17708 "Function should be nounwind in insertCopiesSplitCSR!");
17709 Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);
17713 // Insert the copy-back instructions right before the terminator.
17714 for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
17721 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
17722 // Integer division on AArch64 is expensive. However, when aggressively
17723 // optimizing for code size, we prefer to use a div instruction, as it is
17724 // usually smaller than the alternative sequence.
17725 // The exception to this is vector division. Since AArch64 doesn't have vector
17726 // integer division, leaving the division as-is is a loss even in terms of
17727 // size, because it will have to be scalarized, while the alternative code
17728 // sequence can be performed in vector form.
17729 bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
17730 return OptSize && !VT.isVector();
17733 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
17734 // We want inc-of-add for scalars and sub-of-not for vectors.
17735 return VT.isScalarInteger();
17738 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
17739 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
17743 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
17744 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
17745 return getPointerTy(DL).getSizeInBits();
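  // Otherwise the AAPCS variant of va_list is used:
  //   void *__stack, *__gr_top, *__vr_top;  // 3 pointers
  //   int __gr_offs, __vr_offs;             // 2 x 32-bit offsets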
17747 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
17750 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
17751 MF.getFrameInfo().computeMaxCallFrameSize(MF);
17752 TargetLoweringBase::finalizeLowering(MF);
17755 // Unlike X86, we let frame lowering assign offsets to all catch objects.
bool AArch64TargetLowering::needsFixedCatchObjects() const {
  return false;
}
17760 bool AArch64TargetLowering::shouldLocalize(
17761 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
17762 switch (MI.getOpcode()) {
17763 case TargetOpcode::G_GLOBAL_VALUE: {
    // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
    // another call sequence.
    const GlobalValue &GV = *MI.getOperand(1).getGlobal();
    if (GV.isThreadLocal() && Subtarget->isTargetMachO())
      return false;
    break;
  }
  // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
  // localizable.
  case AArch64::ADRP:
  case AArch64::G_ADD_LOW:
    return true;
  default:
    break;
  }
  return TargetLoweringBase::shouldLocalize(MI, TTI);
}
17783 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
  if (isa<ScalableVectorType>(Inst.getType()))
    return true;

  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
    if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
      return true;

  if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
    if (isa<ScalableVectorType>(AI->getAllocatedType()))
      return true;
  }
  return false;
}
17799 // Return the largest legal scalable vector type that matches VT's element type.
17800 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
17801 assert(VT.isFixedLengthVector() &&
17802 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
17803 "Expected legal fixed length vector!");
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for SVE container");
  case MVT::i8:
    return EVT(MVT::nxv16i8);
  case MVT::i16:
    return EVT(MVT::nxv8i16);
  case MVT::i32:
    return EVT(MVT::nxv4i32);
  case MVT::i64:
    return EVT(MVT::nxv2i64);
  case MVT::f16:
    return EVT(MVT::nxv8f16);
  case MVT::f32:
    return EVT(MVT::nxv4f32);
  case MVT::f64:
    return EVT(MVT::nxv2f64);
  }
}
17824 // Return a PTRUE with active lanes corresponding to the extent of VT.
17825 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
17827 assert(VT.isFixedLengthVector() &&
17828 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
17829 "Expected legal fixed length vector!");
17832 switch (VT.getVectorNumElements()) {
17834 llvm_unreachable("unexpected element count for SVE predicate");
17836 PgPattern = AArch64SVEPredPattern::vl1;
17839 PgPattern = AArch64SVEPredPattern::vl2;
17842 PgPattern = AArch64SVEPredPattern::vl4;
17845 PgPattern = AArch64SVEPredPattern::vl8;
17848 PgPattern = AArch64SVEPredPattern::vl16;
17851 PgPattern = AArch64SVEPredPattern::vl32;
17854 PgPattern = AArch64SVEPredPattern::vl64;
17857 PgPattern = AArch64SVEPredPattern::vl128;
17860 PgPattern = AArch64SVEPredPattern::vl256;
17864 // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
17865 // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
17866 // variants of instructions when available.
17869 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
17871 llvm_unreachable("unexpected element type for SVE predicate");
17873 MaskVT = MVT::nxv16i1;
17877 MaskVT = MVT::nxv8i1;
17881 MaskVT = MVT::nxv4i1;
17885 MaskVT = MVT::nxv2i1;
17889 return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
17890 DAG.getTargetConstant(PgPattern, DL, MVT::i64));
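  // For a legal v8f32, for example, the node built above is an
  // AArch64ISD::PTRUE of type nxv4i1 with pattern vl8, i.e. exactly the first
  // eight .s lanes are active (a sketch; the pattern follows the element
  // count and type selected above).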
17893 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
17895 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
17896 "Expected legal scalable vector!");
17897 auto PredTy = VT.changeVectorElementType(MVT::i1);
17898 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
17901 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
17902 if (VT.isFixedLengthVector())
17903 return getPredicateForFixedLengthVector(DAG, DL, VT);
17905 return getPredicateForScalableVector(DAG, DL, VT);
17908 // Grow V to consume an entire SVE register.
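// e.g. a v4i32 value becomes the low 128 bits of an nxv4i32 register, with
// any remaining lanes left undefined (a sketch of the INSERT_SUBVECTOR below).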
17909 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
17910 assert(VT.isScalableVector() &&
17911 "Expected to convert into a scalable vector!");
17912 assert(V.getValueType().isFixedLengthVector() &&
17913 "Expected a fixed length vector operand!");
17915 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17916 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
17919 // Shrink V so it's just big enough to maintain a VT's worth of data.
17920 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
17921 assert(VT.isFixedLengthVector() &&
17922 "Expected to convert into a fixed length vector!");
17923 assert(V.getValueType().isScalableVector() &&
17924 "Expected a scalable vector operand!");
17926 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
17930 // Convert all fixed length vector loads larger than NEON to masked_loads.
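// e.g. (a sketch) a load of v8i32 becomes a masked load of nxv4i32 whose
// predicate is ptrue vl8, and the result is extracted back to v8i32.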
17931 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
17932 SDValue Op, SelectionDAG &DAG) const {
17933 auto Load = cast<LoadSDNode>(Op);
17936 EVT VT = Op.getValueType();
17937 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
17939 auto NewLoad = DAG.getMaskedLoad(
17940 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
17941 getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
17942 Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
17943 Load->getExtensionType());
17945 auto Result = convertFromScalableVector(DAG, VT, NewLoad);
17946 SDValue MergedValues[2] = {Result, Load->getChain()};
17947 return DAG.getMergeValues(MergedValues, DL);
17950 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
17951 SelectionDAG &DAG) {
17953 EVT InVT = Mask.getValueType();
17954 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
17956 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
17957 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
17958 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
17960 EVT CmpVT = Pg.getValueType();
17961 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
17962 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
// Convert all fixed length vector masked loads larger than NEON to SVE masked
// loads.
17966 SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
17967 SDValue Op, SelectionDAG &DAG) const {
17968 auto Load = cast<MaskedLoadSDNode>(Op);
  if (Load->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD)
    return SDValue();
17974 EVT VT = Op.getValueType();
17975 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
17977 SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);
  SDValue PassThru;
  bool IsPassThruZeroOrUndef = false;

  if (Load->getPassThru()->isUndef()) {
    PassThru = DAG.getUNDEF(ContainerVT);
    IsPassThruZeroOrUndef = true;
  } else {
    if (ContainerVT.isInteger())
      PassThru = DAG.getConstant(0, DL, ContainerVT);
    else
      PassThru = DAG.getConstantFP(0, DL, ContainerVT);
    if (isZerosVector(Load->getPassThru().getNode()))
      IsPassThruZeroOrUndef = true;
  }
17994 auto NewLoad = DAG.getMaskedLoad(
17995 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
17996 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
17997 Load->getAddressingMode(), Load->getExtensionType());
17999 if (!IsPassThruZeroOrUndef) {
18000 SDValue OldPassThru =
18001 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
18002 NewLoad = DAG.getSelect(DL, ContainerVT, Mask, NewLoad, OldPassThru);
18005 auto Result = convertFromScalableVector(DAG, VT, NewLoad);
18006 SDValue MergedValues[2] = {Result, Load->getChain()};
18007 return DAG.getMergeValues(MergedValues, DL);
18010 // Convert all fixed length vector stores larger than NEON to masked_stores.
18011 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
18012 SDValue Op, SelectionDAG &DAG) const {
18013 auto Store = cast<StoreSDNode>(Op);
18016 EVT VT = Store->getValue().getValueType();
18017 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18019 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
18020 return DAG.getMaskedStore(
18021 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
18022 getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
18023 Store->getMemOperand(), Store->getAddressingMode(),
18024 Store->isTruncatingStore());
18027 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
18028 SDValue Op, SelectionDAG &DAG) const {
18029 auto Store = cast<MaskedStoreSDNode>(Op);
  if (Store->isTruncatingStore())
    return SDValue();
18035 EVT VT = Store->getValue().getValueType();
18036 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18038 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
18039 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
18041 return DAG.getMaskedStore(
18042 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
18043 Mask, Store->getMemoryVT(), Store->getMemOperand(),
18044 Store->getAddressingMode(), Store->isTruncatingStore());
18047 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
18048 SDValue Op, SelectionDAG &DAG) const {
18050 EVT VT = Op.getValueType();
18051 EVT EltVT = VT.getVectorElementType();
18053 bool Signed = Op.getOpcode() == ISD::SDIV;
18054 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
18056 // Scalable vector i32/i64 DIV is supported.
18057 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18058 return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
18060 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
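  // For example (a sketch): a v16i8 sdiv widens its operands towards i32,
  // either with a plain extend when the widened fixed type is legal, or with
  // sunpklo/sunpkhi once in scalable form; the widened halves are divided and
  // the results are narrowed back with truncate or uzp1.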
18061 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18062 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18063 EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
18064 EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
18066 // If this is not a full vector, extend, div, and truncate it.
18067 EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
18068 if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
18069 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18070 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
18071 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
18072 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
18073 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
18076 // Convert the operands to scalable vectors.
18077 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
18078 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
18080 // Extend the scalable operands.
18081 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
18082 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
18083 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
18084 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
18085 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
18086 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
18088 // Convert back to fixed vectors so the DIV can be further lowered.
18089 Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
18090 Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
18091 Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
18092 Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
  SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
                                 Op0Lo, Op1Lo);
  SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
                                 Op0Hi, Op1Hi);
18098 // Convert again to scalable vectors to truncate.
18099 ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
18100 ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
18101 SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
18102 ResultLo, ResultHi);
18104 return convertFromScalableVector(DAG, VT, ScalableResult);
18107 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
18108 SDValue Op, SelectionDAG &DAG) const {
18109 EVT VT = Op.getValueType();
18110 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18113 SDValue Val = Op.getOperand(0);
18114 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
18115 Val = convertToScalableVector(DAG, ContainerVT, Val);
18117 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
18118 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
18120 // Repeatedly unpack Val until the result is of the desired element type.
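  // e.g. extending from an i8 container walks nxv16i8 -> nxv8i16 -> nxv4i32
  // -> nxv2i64, stopping once the requested element width is reached (sketch).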
18121 switch (ContainerVT.getSimpleVT().SimpleTy) {
18123 llvm_unreachable("unimplemented container type");
18125 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
18126 if (VT.getVectorElementType() == MVT::i16)
18130 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
18131 if (VT.getVectorElementType() == MVT::i32)
18135 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
18136 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
18140 return convertFromScalableVector(DAG, VT, Val);
18143 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
18144 SDValue Op, SelectionDAG &DAG) const {
18145 EVT VT = Op.getValueType();
18146 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18149 SDValue Val = Op.getOperand(0);
18150 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
18151 Val = convertToScalableVector(DAG, ContainerVT, Val);
18153 // Repeatedly truncate Val until the result is of the desired element type.
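  // e.g. truncating from an i64 container walks nxv2i64 -> nxv4i32 -> nxv8i16
  // -> nxv16i8 via bitcast plus uzp1 with itself, stopping at the requested
  // element width (sketch).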
18154 switch (ContainerVT.getSimpleVT().SimpleTy) {
18156 llvm_unreachable("unimplemented container type");
18158 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
18159 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
18160 if (VT.getVectorElementType() == MVT::i32)
18164 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
18165 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
18166 if (VT.getVectorElementType() == MVT::i16)
18170 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
18171 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
18172 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
18176 return convertFromScalableVector(DAG, VT, Val);
18179 SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
18180 SDValue Op, SelectionDAG &DAG) const {
18181 EVT VT = Op.getValueType();
18182 EVT InVT = Op.getOperand(0).getValueType();
18183 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
18186 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18187 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
18189 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
18192 SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
18193 SDValue Op, SelectionDAG &DAG) const {
18194 EVT VT = Op.getValueType();
18195 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18198 EVT InVT = Op.getOperand(0).getValueType();
18199 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18200 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
18202 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
18203 Op.getOperand(1), Op.getOperand(2));
18205 return convertFromScalableVector(DAG, VT, ScalableRes);
18208 // Convert vector operation 'Op' to an equivalent predicated operation whereby
18209 // the original operation's type is used to construct a suitable predicate.
18210 // NOTE: The results for inactive lanes are undefined.
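// For example (a sketch): a fixed-length v4f32 fadd can be routed here as
// AArch64ISD::FADD_PRED on nxv4f32 under a vl4 predicate, and the result is
// converted back to v4f32 afterwards.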
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                   SelectionDAG &DAG,
                                                   unsigned NewOp,
                                                   bool OverrideNEON) const {
18215 EVT VT = Op.getValueType();
18217 auto Pg = getPredicateForVector(DAG, DL, VT);
18219 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
18220 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18222 // Create list of operands by converting existing ones to scalable types.
18223 SmallVector<SDValue, 4> Operands = {Pg};
18224 for (const SDValue &V : Op->op_values()) {
18225 if (isa<CondCodeSDNode>(V)) {
18226 Operands.push_back(V);
18230 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
18231 EVT VTArg = VTNode->getVT().getVectorElementType();
18232 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
18233 Operands.push_back(DAG.getValueType(NewVTArg));
18237 assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
18238 "Only fixed length vectors are supported!");
18239 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
18242 if (isMergePassthruOpcode(NewOp))
18243 Operands.push_back(DAG.getUNDEF(ContainerVT));
18245 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
18246 return convertFromScalableVector(DAG, VT, ScalableRes);
18249 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
18251 SmallVector<SDValue, 4> Operands = {Pg};
18252 for (const SDValue &V : Op->op_values()) {
18253 assert((!V.getValueType().isVector() ||
18254 V.getValueType().isScalableVector()) &&
18255 "Only scalable vectors are supported!");
18256 Operands.push_back(V);
18259 if (isMergePassthruOpcode(NewOp))
18260 Operands.push_back(DAG.getUNDEF(VT));
18262 return DAG.getNode(NewOp, DL, VT, Operands);
18265 // If a fixed length vector operation has no side effects when applied to
18266 // undefined elements, we can safely use scalable vectors to perform the same
18267 // operation without needing to worry about predication.
18268 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
18269 SelectionDAG &DAG) const {
18270 EVT VT = Op.getValueType();
18271 assert(useSVEForFixedLengthVectorVT(VT) &&
18272 "Only expected to lower fixed length vector operation!");
18273 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18275 // Create list of operands by converting existing ones to scalable types.
18276 SmallVector<SDValue, 4> Ops;
18277 for (const SDValue &V : Op->op_values()) {
18278 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
18280 // Pass through non-vector operands.
    if (!V.getValueType().isVector()) {
      Ops.push_back(V);
      continue;
    }
18286 // "cast" fixed length vector to a scalable vector.
18287 assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
18288 "Only fixed length vectors are supported!");
18289 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
18292 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
18293 return convertFromScalableVector(DAG, VT, ScalableRes);
18296 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
18297 SelectionDAG &DAG) const {
18298 SDLoc DL(ScalarOp);
18299 SDValue AccOp = ScalarOp.getOperand(0);
18300 SDValue VecOp = ScalarOp.getOperand(1);
18301 EVT SrcVT = VecOp.getValueType();
18302 EVT ResVT = SrcVT.getVectorElementType();
18304 EVT ContainerVT = SrcVT;
18305 if (SrcVT.isFixedLengthVector()) {
18306 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
18307 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
18310 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
18311 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
18313 // Convert operands to Scalable.
18314 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
18315 DAG.getUNDEF(ContainerVT), AccOp, Zero);
18317 // Perform reduction.
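  // FADDA is a strictly-ordered (sequential) floating-point add reduction: the
  // initial value is taken from lane 0 of the accumulator operand, which is
  // why AccOp was inserted at element 0 above, and the final sum is read back
  // from lane 0 below (explanatory note).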
  SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
                            Pg, AccOp, VecOp);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
}
18324 SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
18325 SelectionDAG &DAG) const {
18326 SDLoc DL(ReduceOp);
18327 SDValue Op = ReduceOp.getOperand(0);
18328 EVT OpVT = Op.getValueType();
18329 EVT VT = ReduceOp.getValueType();
18331 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18334 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
18336 switch (ReduceOp.getOpcode()) {
18339 case ISD::VECREDUCE_OR:
18340 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
18341 case ISD::VECREDUCE_AND: {
18342 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
18343 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
18345 case ISD::VECREDUCE_XOR: {
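    // XOR-reducing an i1 vector yields the parity of the number of active
    // lanes, so CNTP (count of true predicate elements) truncated to i1 gives
    // the same result (explanatory note).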
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
    SDValue Cntp =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
  }
18357 SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
18359 SelectionDAG &DAG) const {
18360 SDLoc DL(ScalarOp);
18361 SDValue VecOp = ScalarOp.getOperand(0);
18362 EVT SrcVT = VecOp.getValueType();
18364 if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
18365 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
18366 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
18369 // UADDV always returns an i64 result.
  EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
                                                   SrcVT.getVectorElementType();
  EVT RdxVT = SrcVT;
  if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
    RdxVT = getPackedSVEVectorVT(ResVT);
18376 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
18377 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
18378 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
18379 Rdx, DAG.getConstant(0, DL, MVT::i64));
18381 // The VEC_REDUCE nodes expect an element size result.
  if (ResVT != ScalarOp.getValueType())
    Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());

  return Res;
}

SDValue
18389 AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
18390 SelectionDAG &DAG) const {
18391 EVT VT = Op.getValueType();
18394 EVT InVT = Op.getOperand(1).getValueType();
18395 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18396 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
18397 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
18401 EVT MaskVT = Op.getOperand(0).getValueType();
18402 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
18403 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
18404 Mask = DAG.getNode(ISD::TRUNCATE, DL,
18405 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
  auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
                                 Mask, Op1, Op2);
18410 return convertFromScalableVector(DAG, VT, ScalableRes);
18413 SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
18414 SDValue Op, SelectionDAG &DAG) const {
18416 EVT InVT = Op.getOperand(0).getValueType();
18417 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18419 assert(useSVEForFixedLengthVectorVT(InVT) &&
18420 "Only expected to lower fixed length vector operation!");
18421 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
18422 "Expected integer result of the same bit length as the inputs!");
18424 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
18425 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
18426 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
18428 EVT CmpVT = Pg.getValueType();
18429 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
18430 {Pg, Op1, Op2, Op.getOperand(2)});
18432 EVT PromoteVT = ContainerVT.changeTypeToInteger();
18433 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
18434 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
18438 AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
18439 SelectionDAG &DAG) const {
18441 auto SrcOp = Op.getOperand(0);
18442 EVT VT = Op.getValueType();
18443 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18444 EVT ContainerSrcVT =
18445 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
18447 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
18448 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
18449 return convertFromScalableVector(DAG, VT, Op);
18452 SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
18453 SDValue Op, SelectionDAG &DAG) const {
18455 unsigned NumOperands = Op->getNumOperands();
18457 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
18458 "Unexpected number of operands in CONCAT_VECTORS");
18460 auto SrcOp1 = Op.getOperand(0);
18461 auto SrcOp2 = Op.getOperand(1);
18462 EVT VT = Op.getValueType();
18463 EVT SrcVT = SrcOp1.getValueType();
18465 if (NumOperands > 2) {
18466 SmallVector<SDValue, 4> Ops;
18467 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
18468 for (unsigned I = 0; I < NumOperands; I += 2)
18469 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
18470 Op->getOperand(I), Op->getOperand(I + 1)));
18472 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
18475 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18477 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
18478 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
18479 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
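  // SPLICE under the vlN predicate takes the first N (active) elements of
  // SrcOp1 followed by the leading elements of SrcOp2, which reproduces
  // CONCAT_VECTORS for these fixed-length operands (explanatory note).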
18481 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
18483 return convertFromScalableVector(DAG, VT, Op);
18487 AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
18488 SelectionDAG &DAG) const {
18489 EVT VT = Op.getValueType();
18490 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18493 SDValue Val = Op.getOperand(0);
18494 SDValue Pg = getPredicateForVector(DAG, DL, VT);
18495 EVT SrcVT = Val.getValueType();
18496 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18497 EVT ExtendVT = ContainerVT.changeVectorElementType(
18498 SrcVT.getVectorElementType());
18500 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
18501 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
18503 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
18504 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
18505 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
18506 Pg, Val, DAG.getUNDEF(ContainerVT));
18508 return convertFromScalableVector(DAG, VT, Val);
18512 AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
18513 SelectionDAG &DAG) const {
18514 EVT VT = Op.getValueType();
18515 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18518 SDValue Val = Op.getOperand(0);
18519 EVT SrcVT = Val.getValueType();
18520 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
18521 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
18522 VT.getVectorElementType());
18523 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
18525 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18526 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
18527 Op.getOperand(1), DAG.getUNDEF(RoundVT));
18528 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
18529 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
18531 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
18532 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
18536 AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
18537 SelectionDAG &DAG) const {
18538 EVT VT = Op.getValueType();
18539 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18541 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
18542 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
18543 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
18546 SDValue Val = Op.getOperand(0);
18547 EVT SrcVT = Val.getValueType();
18548 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18549 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
18551 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
18552 ContainerDstVT.getVectorElementType().getSizeInBits()) {
18553 SDValue Pg = getPredicateForVector(DAG, DL, VT);
18555 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
18556 VT.changeTypeToInteger(), Val);
18558 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18559 Val = getSVESafeBitCast(ContainerDstVT.changeTypeToInteger(), Val, DAG);
18560 // Safe to use a larger than specified operand since we just unpacked the
18561 // data, hence the upper bits are zero.
18562 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
18563 DAG.getUNDEF(ContainerDstVT));
18564 return convertFromScalableVector(DAG, VT, Val);
18566 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
18567 ContainerDstVT.getVectorElementType());
18568 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
18570 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18571 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
18572 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
18573 Val = convertFromScalableVector(DAG, SrcVT, Val);
18575 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
18576 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
18581 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
18582 SelectionDAG &DAG) const {
18583 EVT VT = Op.getValueType();
18584 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18586 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
18587 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
18588 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
18591 SDValue Val = Op.getOperand(0);
18592 EVT SrcVT = Val.getValueType();
18593 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18594 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
18596 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
18597 ContainerDstVT.getVectorElementType().getSizeInBits()) {
18598 EVT CvtVT = ContainerDstVT.changeVectorElementType(
18599 ContainerSrcVT.getVectorElementType());
18600 SDValue Pg = getPredicateForVector(DAG, DL, VT);
18602 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
18603 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
18605 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18606 Val = getSVESafeBitCast(CvtVT, Val, DAG);
18607 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
18608 DAG.getUNDEF(ContainerDstVT));
18609 return convertFromScalableVector(DAG, VT, Val);
18611 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
18612 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
18614 // Safe to use a larger than specified result since an fp_to_int where the
18615 // result doesn't fit into the destination is undefined.
18616 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18617 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
18618 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
18620 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
18624 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
18625 SDValue Op, SelectionDAG &DAG) const {
18626 EVT VT = Op.getValueType();
18627 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18629 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
18630 auto ShuffleMask = SVN->getMask();
18633 SDValue Op1 = Op.getOperand(0);
18634 SDValue Op2 = Op.getOperand(1);
18636 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18637 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
18638 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
      Imm == VT.getVectorNumElements() - 1) {
    if (ReverseEXT)
      std::swap(Op1, Op2);
18647 EVT ScalarTy = VT.getVectorElementType();
18648 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
18649 ScalarTy = MVT::i32;
18650 SDValue Scalar = DAG.getNode(
18651 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
18652 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
18653 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
    return convertFromScalableVector(DAG, VT, Op);
  }

  return SDValue();
}
18660 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
18661 SelectionDAG &DAG) const {
18663 EVT InVT = Op.getValueType();
18664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18667 assert(VT.isScalableVector() && TLI.isTypeLegal(VT) &&
18668 InVT.isScalableVector() && TLI.isTypeLegal(InVT) &&
18669 "Only expect to cast between legal scalable vector types!");
18670 assert((VT.getVectorElementType() == MVT::i1) ==
18671 (InVT.getVectorElementType() == MVT::i1) &&
18672 "Cannot cast between data and predicate scalable vector types!");
18677 if (VT.getVectorElementType() == MVT::i1)
18678 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
18680 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
18681 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
18683 // Pack input if required.
18684 if (InVT != PackedInVT)
18685 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
18687 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
18689 // Unpack result if required.
18690 if (VT != PackedVT)
18691 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
18696 bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
18697 return ::isAllActivePredicate(N);
18700 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
18701 return ::getPromotedVTForPredicate(VT);
18704 bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
18705 SDValue Op, const APInt &OriginalDemandedBits,
18706 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
18707 unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case AArch64ISD::VSHL: {
18712 // Match (VSHL (VLSHR Val X) X)
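    // e.g. (VSHL (VLSHR Val, 8), 8) merely clears the low 8 bits of Val; if
    // the caller does not demand any of those bits, the whole pair can be
    // replaced by Val (illustrative example of the fold below).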
18713 SDValue ShiftL = Op;
18714 SDValue ShiftR = Op->getOperand(0);
    if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
      break;

    if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
      break;

    unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
    unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);

    // Other cases can be handled as well, but this is not
    // implemented.
    if (ShiftRBits != ShiftLBits)
      break;

    unsigned ScalarSize = Op.getScalarValueSizeInBits();
    assert(ScalarSize > ShiftLBits && "Invalid shift imm");

    APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
    APInt UnusedBits = ~OriginalDemandedBits;

    if ((ZeroBits & UnusedBits) != ZeroBits)
      break;
18738 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
18739 // used - simplify to just Val.
18740 return TLO.CombineTo(Op, ShiftR->getOperand(0));
18744 return TargetLowering::SimplifyDemandedBitsForTargetNode(
18745 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
18748 bool AArch64TargetLowering::isConstantUnsignedBitfieldExtactLegal(
18749 unsigned Opc, LLT Ty1, LLT Ty2) const {
18750 return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));