//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;
#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
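
// For example, lowering code elsewhere in this file reports a recoverable
// diagnostic such as
//   errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
// and then substitutes a safe value rather than aborting compilation.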
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
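  // Concretely: a scalar SETCC produces 0 or 1 in an i8 register, while a
  // vector compare such as PCMPGTD produces lanes that are all-ones (-1) or
  // all-zeros, which is what the two settings above describe.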
  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
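  // The bypass above inserts a runtime check: e.g. a 64-bit udiv whose
  // operands happen to fit in 32 bits is dispatched to the cheaper 32-bit
  // DIV instead of the full-width divide.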
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
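  // (UCOMISS/UCOMISD set ZF both for "equal" and for "unordered", so an
  // ordered-equal test has to combine two flags, ZF set and PF clear;
  // expanding SETOEQ/SETUNE lets the legalizer emit that two-flag sequence.)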
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
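  // After this, DAG combine can merge e.g. (sdiv x, y) and (srem x, y) into a
  // single ISD::SDIVREM node, matching x86's IDIV, which produces the quotient
  // in rAX and the remainder in rDX (AL/AH for the i8 case).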
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
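  // Without BMI's TZCNT, plain BSF leaves the destination undefined for a
  // zero input, so the plain CTTZ forms above get a custom zero check while
  // the CTTZ_ZERO_UNDEF forms map directly to BSF.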
  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
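  // When expanded, these f16 conversions become soft-float library calls
  // (__gnu_h2f_ieee/__gnu_f2h_ieee on x86) instead of F16C's
  // VCVTPH2PS/VCVTPS2PH.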
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::VR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f80, Expand);
    setOperationAction(ISD::FCOS, MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
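  // For example, ISD::SDIV keeps the Expand default set here for every vector
  // type (x86 has no vector integer divide instruction), while ops such as
  // SETCC are re-marked Custom or Legal by the feature-specific blocks below.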
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }
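    // The v8i16/v16i8 exceptions above match the instructions SSE2 actually
    // provides: PMAXSW/PMINSW for signed words and PMAXUB/PMINUB for unsigned
    // bytes; the remaining type/signedness combinations need custom code.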
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Provide custom widening for v2f32 setcc. This is really for VLX when
    // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
    // type legalization changing the result type to v4i1 during widening.
    // It works fine for SSE2 and is probably faster so no need to qualify with
    // VLX support.
    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }
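    // e.g. (setlt X, Y) is handled by emitting (setgt Y, X) with the operands
    // swapped, so the single PCMPGT instruction covers both orderings.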
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
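    // e.g. a v16i8 AND is bitcast to v2i64, performed there, and bitcast back,
    // so a single PAND pattern serves every 128-bit integer element width.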
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
    setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
    setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
    setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
    setOperationAction(ISD::ROTL, MVT::v32i8, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i64, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    if (HasInt256) {
      // Custom legalize 2x32 to get a little better code.
      setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
      setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::MGATHER, VT, Custom);
    }
  }
  // This block controls legalization of the mask vector sizes that are
  // available with AVX512. 512-bit vectors are in a separate block controlled
  // by useAVX512Regs.
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);

    // There is no byte sized k-register load or store without AVX512DQ.
    if (!Subtarget.hasDQI()) {
      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);

      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
    }
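    // (AVX512DQ adds KMOVB for moving 8-bit masks directly; without it the
    // custom lowering above goes through the 16-bit KMOVW instead.)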
    // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  }
1246 // This block controls legalization for 512-bit operations with 32/64 bit
1247 // elements. 512-bits can be disabled based on prefer-vector-width and
1248 // required-vector-width function attributes.
1249 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1250 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1251 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1252 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1253 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1255 for (MVT VT : MVT::fp_vector_valuetypes())
1256 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1258 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1259 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1260 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1261 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1262 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1263 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1266 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1267 setOperationAction(ISD::FNEG, VT, Custom);
1268 setOperationAction(ISD::FABS, VT, Custom);
1269 setOperationAction(ISD::FMA, VT, Legal);
1270 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1273 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1274 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
1275 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
1276 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
1277 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1278 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
1279 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
1280 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
1281 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1282 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1284 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1285 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1286 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1287 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1288 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1290 if (!Subtarget.hasVLX()) {
1291 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1292 // to 512-bit rather than use the AVX2 instructions so that we can use
1294 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1295 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1296 setOperationAction(ISD::MLOAD, VT, Custom);
1297 setOperationAction(ISD::MSTORE, VT, Custom);
1301 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1302 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1303 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1304 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1305 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1306 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1307 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1308 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1310 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1311 setOperationAction(ISD::FFLOOR, VT, Legal);
1312 setOperationAction(ISD::FCEIL, VT, Legal);
1313 setOperationAction(ISD::FTRUNC, VT, Legal);
1314 setOperationAction(ISD::FRINT, VT, Legal);
1315 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1318 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1319 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1321 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1322 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1323 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1325 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1326 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1327 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1328 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1330 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1331 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1333 setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
1334 setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
1336 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1337 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1338 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1340 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1341 setOperationAction(ISD::SMAX, VT, Legal);
1342 setOperationAction(ISD::UMAX, VT, Legal);
1343 setOperationAction(ISD::SMIN, VT, Legal);
1344 setOperationAction(ISD::UMIN, VT, Legal);
1345 setOperationAction(ISD::ABS, VT, Legal);
1346 setOperationAction(ISD::SRL, VT, Custom);
1347 setOperationAction(ISD::SHL, VT, Custom);
1348 setOperationAction(ISD::SRA, VT, Custom);
1349 setOperationAction(ISD::CTPOP, VT, Custom);
1350 setOperationAction(ISD::CTTZ, VT, Custom);
1351 setOperationAction(ISD::ROTL, VT, Custom);
1352 setOperationAction(ISD::ROTR, VT, Custom);
1353 setOperationAction(ISD::SETCC, VT, Custom);
1355 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1356 // setcc all the way to isel and prefer SETGT in some isel patterns.
1357 setCondCodeAction(ISD::SETLT, VT, Custom);
1358 setCondCodeAction(ISD::SETLE, VT, Custom);
1361 // Need to promote to 64-bit even though we have 32-bit masked instructions
1362 // because the IR optimizers rearrange bitcasts around logic ops leaving
1363 // too many variations to handle if we don't promote them.
1364 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1365 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1366 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
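// e.g. a v16i32 AND becomes roughly
//   (v16i32 (bitcast (and (v8i64 (bitcast A)), (v8i64 (bitcast B)))))
// so a single 512-bit VPANDQ pattern covers every integer element width.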
1368 if (Subtarget.hasDQI()) {
1369 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1370 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1371 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1372 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1374 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1377 if (Subtarget.hasCDI()) {
1378 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1379 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1380 setOperationAction(ISD::CTLZ, VT, Legal);
1381 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1383 } // Subtarget.hasCDI()
1385 if (Subtarget.hasVPOPCNTDQ()) {
1386 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1387 setOperationAction(ISD::CTPOP, VT, Legal);
1390 // Extract subvector is special because the value type
1391 // (result) is 256-bit but the source is 512-bit wide.
1392 // 128-bit was made Legal under AVX1.
1393 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1394 MVT::v8f32, MVT::v4f64 })
1395 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1397 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1398 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1399 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1400 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1401 setOperationAction(ISD::VSELECT, VT, Custom);
1402 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1403 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1404 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1405 setOperationAction(ISD::MLOAD, VT, Legal);
1406 setOperationAction(ISD::MSTORE, VT, Legal);
1407 setOperationAction(ISD::MGATHER, VT, Custom);
1408 setOperationAction(ISD::MSCATTER, VT, Custom);
1410 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1411 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1412 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1415 // Need to custom split v32i16/v64i8 bitcasts.
1416 if (!Subtarget.hasBWI()) {
1417 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1418 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1422 // This block controls legalization for operations that don't have
1423 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1424 // 128/256-bit operations.
1425 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1426 // These operations are handled on non-VLX by artificially widening in
1427 // isel patterns.
1428 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1430 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1431 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1432 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1433 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1434 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1436 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1437 setOperationAction(ISD::SMAX, VT, Legal);
1438 setOperationAction(ISD::UMAX, VT, Legal);
1439 setOperationAction(ISD::SMIN, VT, Legal);
1440 setOperationAction(ISD::UMIN, VT, Legal);
1441 setOperationAction(ISD::ABS, VT, Legal);
1444 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1445 setOperationAction(ISD::ROTL, VT, Custom);
1446 setOperationAction(ISD::ROTR, VT, Custom);
1449 // Custom legalize 2x32 to get a little better code.
1450 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1451 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1453 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1454 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1455 setOperationAction(ISD::MSCATTER, VT, Custom);
1457 if (Subtarget.hasDQI()) {
1458 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1459 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1460 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1461 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1462 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1464 setOperationAction(ISD::MUL, VT, Legal);
1468 if (Subtarget.hasCDI()) {
1469 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1470 setOperationAction(ISD::CTLZ, VT, Legal);
1471 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1473 } // Subtarget.hasCDI()
1475 if (Subtarget.hasVPOPCNTDQ()) {
1476 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1477 setOperationAction(ISD::CTPOP, VT, Legal);
1481 // This block controls legalization of v32i1/v64i1 which are available with
1482 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1483 // useBWIRegs.
1484 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1485 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1486 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1488 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1489 setOperationAction(ISD::ADD, VT, Custom);
1490 setOperationAction(ISD::SUB, VT, Custom);
1491 setOperationAction(ISD::MUL, VT, Custom);
1492 setOperationAction(ISD::VSELECT, VT, Expand);
1494 setOperationAction(ISD::TRUNCATE, VT, Custom);
1495 setOperationAction(ISD::SETCC, VT, Custom);
1496 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1497 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1498 setOperationAction(ISD::SELECT, VT, Custom);
1499 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1500 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1503 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1505 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1507 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1508 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1510 // Extends from v32i1 masks to 256-bit vectors.
1511 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1512 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1513 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1516 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1517 // disabled based on prefer-vector-width and required-vector-width function
1518 // attributes.
1519 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1520 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1521 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1523 // Extends from v64i1 masks to 512-bit vectors.
1524 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1525 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1526 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1528 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1529 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1530 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1531 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1532 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1533 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1534 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1535 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1536 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1537 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1538 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1539 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1540 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1541 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1542 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1543 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1544 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1545 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1546 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1547 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1548 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1549 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1550 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1552 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1554 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1556 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1557 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1558 setOperationAction(ISD::VSELECT, VT, Custom);
1559 setOperationAction(ISD::ABS, VT, Legal);
1560 setOperationAction(ISD::SRL, VT, Custom);
1561 setOperationAction(ISD::SHL, VT, Custom);
1562 setOperationAction(ISD::SRA, VT, Custom);
1563 setOperationAction(ISD::MLOAD, VT, Legal);
1564 setOperationAction(ISD::MSTORE, VT, Legal);
1565 setOperationAction(ISD::CTPOP, VT, Custom);
1566 setOperationAction(ISD::CTTZ, VT, Custom);
1567 setOperationAction(ISD::CTLZ, VT, Custom);
1568 setOperationAction(ISD::SMAX, VT, Legal);
1569 setOperationAction(ISD::UMAX, VT, Legal);
1570 setOperationAction(ISD::SMIN, VT, Legal);
1571 setOperationAction(ISD::UMIN, VT, Legal);
1572 setOperationAction(ISD::SETCC, VT, Custom);
1574 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1575 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1576 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1579 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1580 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1583 if (Subtarget.hasBITALG()) {
1584 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1585 setOperationAction(ISD::CTPOP, VT, Legal);
1589 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1590 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1591 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1592 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1595 // These operations are handled on non-VLX by artificially widening in
1596 // isel patterns.
1597 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1599 if (Subtarget.hasBITALG()) {
1600 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1601 setOperationAction(ISD::CTPOP, VT, Legal);
1605 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1606 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1607 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1608 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1609 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1610 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1612 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1613 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1614 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1615 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1616 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1618 if (Subtarget.hasDQI()) {
1619 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1620 // v2f32 UINT_TO_FP is already custom under SSE2.
1621 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1622 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1623 "Unexpected operation action!");
1624 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1625 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1626 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1629 if (Subtarget.hasBWI()) {
1630 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1631 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1635 // We want to custom lower some of our intrinsics.
1636 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1637 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1638 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1639 if (!Subtarget.is64Bit()) {
1640 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1641 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1644 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1645 // handle type legalization for these operations here.
1647 // FIXME: We really should do custom legalization for addition and
1648 // subtraction on x86-32 once PR3203 is fixed. We can't do much better
1649 // than generic legalization for 64-bit multiplication-with-overflow, though.
1650 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1651 if (VT == MVT::i64 && !Subtarget.is64Bit())
1652 continue;
1653 // Add/Sub/Mul with overflow operations are custom lowered.
1654 setOperationAction(ISD::SADDO, VT, Custom);
1655 setOperationAction(ISD::UADDO, VT, Custom);
1656 setOperationAction(ISD::SSUBO, VT, Custom);
1657 setOperationAction(ISD::USUBO, VT, Custom);
1658 setOperationAction(ISD::SMULO, VT, Custom);
1659 setOperationAction(ISD::UMULO, VT, Custom);
1661 // Support carry in as value rather than glue.
1662 setOperationAction(ISD::ADDCARRY, VT, Custom);
1663 setOperationAction(ISD::SUBCARRY, VT, Custom);
1664 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1667 if (!Subtarget.is64Bit()) {
1668 // These libcalls are not available in 32-bit.
1669 setLibcallName(RTLIB::SHL_I128, nullptr);
1670 setLibcallName(RTLIB::SRL_I128, nullptr);
1671 setLibcallName(RTLIB::SRA_I128, nullptr);
1672 setLibcallName(RTLIB::MUL_I128, nullptr);
1673 }
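// With the libcall names cleared, legalization expands these inline; e.g. a
// 128-bit shift-left is lowered to a multi-word shift/or sequence rather than
// a call to __ashlti3.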
1675 // Combine sin / cos into _sincos_stret if it is available.
1676 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1677 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1678 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1679 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1682 if (Subtarget.isTargetWin64()) {
1683 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1684 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1685 setOperationAction(ISD::SREM, MVT::i128, Custom);
1686 setOperationAction(ISD::UREM, MVT::i128, Custom);
1687 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1688 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1691 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1692 // is. We should promote the value to 64-bits to solve this.
1693 // This is what the CRT headers do - `fmodf` is an inline header
1694 // function casting to f64 and calling `fmod`.
1695 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1696 Subtarget.isTargetWindowsItanium()))
1697 for (ISD::NodeType Op :
1698 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1699 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1700 if (isOperationExpand(Op, MVT::f32))
1701 setOperationAction(Op, MVT::f32, Promote);
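// Under this promotion, a call such as fmodf(x, y) is emitted roughly as
// (float)fmod((double)x, (double)y), matching the CRT's own inline definition.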
1703 // We have target-specific dag combine patterns for the following nodes:
1704 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1705 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1706 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1707 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1708 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1709 setTargetDAGCombine(ISD::BITCAST);
1710 setTargetDAGCombine(ISD::VSELECT);
1711 setTargetDAGCombine(ISD::SELECT);
1712 setTargetDAGCombine(ISD::SHL);
1713 setTargetDAGCombine(ISD::SRA);
1714 setTargetDAGCombine(ISD::SRL);
1715 setTargetDAGCombine(ISD::OR);
1716 setTargetDAGCombine(ISD::AND);
1717 setTargetDAGCombine(ISD::ADD);
1718 setTargetDAGCombine(ISD::FADD);
1719 setTargetDAGCombine(ISD::FSUB);
1720 setTargetDAGCombine(ISD::FNEG);
1721 setTargetDAGCombine(ISD::FMA);
1722 setTargetDAGCombine(ISD::FMINNUM);
1723 setTargetDAGCombine(ISD::FMAXNUM);
1724 setTargetDAGCombine(ISD::SUB);
1725 setTargetDAGCombine(ISD::LOAD);
1726 setTargetDAGCombine(ISD::MLOAD);
1727 setTargetDAGCombine(ISD::STORE);
1728 setTargetDAGCombine(ISD::MSTORE);
1729 setTargetDAGCombine(ISD::TRUNCATE);
1730 setTargetDAGCombine(ISD::ZERO_EXTEND);
1731 setTargetDAGCombine(ISD::ANY_EXTEND);
1732 setTargetDAGCombine(ISD::SIGN_EXTEND);
1733 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1734 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1735 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1736 setTargetDAGCombine(ISD::SINT_TO_FP);
1737 setTargetDAGCombine(ISD::UINT_TO_FP);
1738 setTargetDAGCombine(ISD::SETCC);
1739 setTargetDAGCombine(ISD::MUL);
1740 setTargetDAGCombine(ISD::XOR);
1741 setTargetDAGCombine(ISD::MSCATTER);
1742 setTargetDAGCombine(ISD::MGATHER);
1744 computeRegisterProperties(Subtarget.getRegisterInfo());
1746 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1747 MaxStoresPerMemsetOptSize = 8;
1748 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1749 MaxStoresPerMemcpyOptSize = 4;
1750 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1751 MaxStoresPerMemmoveOptSize = 4;
1753 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1754 // that needs to be benchmarked and balanced with the potential use of vector
1755 // load/store types (PR33329, PR33914).
1756 MaxLoadsPerMemcmp = 2;
1757 MaxLoadsPerMemcmpOptSize = 2;
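// e.g. with a limit of 2 loads, memcmp(a, b, 16) on x86-64 can expand to two
// 8-byte load/compare pairs; larger sizes still call the library routine.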
1759 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1760 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
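// e.g. the default value of 4 emits .p2align 4 before loop headers, so header
// addresses are 16-byte aligned.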
1762 // An out-of-order CPU can speculatively execute past a predictable branch,
1763 // but a conditional move could be stalled by an expensive earlier operation.
1764 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1765 EnableExtLdPromotion = true;
1766 setPrefFunctionAlignment(4); // 2^4 bytes.
1768 verifyIntrinsicTables();
1771 // This has so far only been implemented for 64-bit MachO.
1772 bool X86TargetLowering::useLoadStackGuardNode() const {
1773 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1776 bool X86TargetLowering::useStackGuardXorFP() const {
1777 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1778 return Subtarget.getTargetTriple().isOSMSVCRT();
1781 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1782 const SDLoc &DL) const {
1783 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1784 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1785 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1786 return SDValue(Node, 0);
1787 }
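// The XOR*_FP pseudo is expanded late into an XOR of the guard value with the
// frame pointer (roughly 'xorq %rbp, %rax' on 64-bit targets), mirroring how
// the MSVC CRT mixes a frame address into __security_cookie.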
1789 TargetLoweringBase::LegalizeTypeAction
1790 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1791 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1792 return TypeSplitVector;
1794 if (ExperimentalVectorWideningLegalization &&
1795 VT.getVectorNumElements() != 1 &&
1796 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1797 return TypeWidenVector;
1799 return TargetLoweringBase::getPreferredVectorAction(VT);
1802 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1803 EVT VT) const {
1804 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1805 return MVT::v32i8;
1806 return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
1807 }
1809 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1810 EVT VT) const {
1811 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1812 return 8;
1813 return TargetLowering::getNumRegistersForCallingConv(Context, VT);
1814 }
1816 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1817 LLVMContext& Context,
1818 EVT VT) const {
1819 if (!VT.isVector())
1820 return MVT::i8;
1822 if (Subtarget.hasAVX512()) {
1823 const unsigned NumElts = VT.getVectorNumElements();
1825 // Figure out what this type will be legalized to.
1826 EVT LegalVT = VT;
1827 while (getTypeAction(Context, LegalVT) != TypeLegal)
1828 LegalVT = getTypeToTransformTo(Context, LegalVT);
1830 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1831 if (LegalVT.getSimpleVT().is512BitVector())
1832 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1834 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1835 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1836 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
1837 // vXi16/vXi8.
1838 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1839 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1840 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1841 }
1842 }
1844 return VT.changeVectorElementTypeToInteger();
1845 }
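// For example, with AVX512 a setcc on v16f32 operands yields a v16i1 mask in
// a k-register, while targets without it get the usual all-ones/all-zeros
// vector of the operand's integer width (e.g. v8i32 for a v8f32 compare).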
1847 /// Helper for getByValTypeAlignment to determine
1848 /// the desired ByVal argument alignment.
1849 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1850 if (MaxAlign == 16)
1851 return;
1852 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1853 if (VTy->getBitWidth() == 128)
1854 MaxAlign = 16;
1855 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1856 unsigned EltAlign = 0;
1857 getMaxByValAlign(ATy->getElementType(), EltAlign);
1858 if (EltAlign > MaxAlign)
1859 MaxAlign = EltAlign;
1860 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1861 for (auto *EltTy : STy->elements()) {
1862 unsigned EltAlign = 0;
1863 getMaxByValAlign(EltTy, EltAlign);
1864 if (EltAlign > MaxAlign)
1865 MaxAlign = EltAlign;
1866 if (MaxAlign == 16)
1867 break;
1868 }
1869 }
1870 }
1872 /// Return the desired alignment for ByVal aggregate
1873 /// function arguments in the caller parameter area. For X86, aggregates
1874 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1875 /// are at 4-byte boundaries.
1876 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1877 const DataLayout &DL) const {
1878 if (Subtarget.is64Bit()) {
1879 // Max of 8 and alignment of type.
1880 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1881 if (TyAlign > 8)
1882 return TyAlign;
1883 return 8;
1884 }
1886 unsigned Align = 4;
1887 if (Subtarget.hasSSE1())
1888 getMaxByValAlign(Ty, Align);
1889 return Align;
1890 }
1892 /// Returns the target specific optimal type for load
1893 /// and store operations as a result of memset, memcpy, and memmove
1894 /// lowering. If DstAlign is zero that means it's safe to assume the
1895 /// destination alignment can satisfy any constraint. Similarly if SrcAlign is
1896 /// zero there isn't a need to check it against the alignment requirement,
1897 /// probably because the source does not need to be loaded. If 'IsMemset' is
1898 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1899 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1900 /// source is constant so it does not need to be loaded.
1901 /// It returns EVT::Other if the type should be determined using generic
1902 /// target-independent logic.
1903 EVT
1904 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1905 unsigned DstAlign, unsigned SrcAlign,
1906 bool IsMemset, bool ZeroMemset,
1907 bool MemcpyStrSrc,
1908 MachineFunction &MF) const {
1909 const Function &F = MF.getFunction();
1910 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
1911 if (Size >= 16 &&
1912 (!Subtarget.isUnalignedMem16Slow() ||
1913 ((DstAlign == 0 || DstAlign >= 16) &&
1914 (SrcAlign == 0 || SrcAlign >= 16)))) {
1915 // FIXME: Check if unaligned 32-byte accesses are slow.
1916 if (Size >= 32 && Subtarget.hasAVX()) {
1917 // Although this isn't a well-supported type for AVX1, we'll let
1918 // legalization and shuffle lowering produce the optimal codegen. If we
1919 // choose an optimal type with a vector element larger than a byte,
1920 // getMemsetStores() may create an intermediate splat (using an integer
1921 // multiply) before we splat as a vector.
1922 return MVT::v32i8;
1923 }
1924 if (Subtarget.hasSSE2())
1925 return MVT::v16i8;
1926 // TODO: Can SSE1 handle a byte vector?
1927 if (Subtarget.hasSSE1())
1928 return MVT::v4f32;
1929 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1930 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1931 // Do not use f64 to lower memcpy if the source is a string constant. It's
1932 // better to use i32 to avoid the loads.
1933 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1934 // The gymnastics of splatting a byte value into an XMM register and then
1935 // only using 8-byte stores (because this is a CPU with slow unaligned
1936 // 16-byte accesses) makes that a loser.
1937 return MVT::f64;
1938 }
1939 }
1940 // This is a compromise. If we reach here, unaligned accesses may be slow on
1941 // this target. However, creating smaller, aligned accesses could be even
1942 // slower and would certainly be a lot more code.
1943 if (Subtarget.is64Bit() && Size >= 8)
1944 return MVT::i64;
1945 return MVT::i32;
1946 }
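// e.g. a 32-byte memset on an AVX machine is typed as v32i8 and becomes one
// 256-bit store, while a small copy on a CPU with slow unaligned 16-byte
// accesses falls back to the scalar i64/i32 path at the end.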
1948 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1949 if (VT == MVT::f32)
1950 return X86ScalarSSEf32;
1951 else if (VT == MVT::f64)
1952 return X86ScalarSSEf64;
1953 return true;
1954 }
1956 bool
1957 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1958 unsigned,
1959 unsigned,
1960 bool *Fast) const {
1961 if (Fast) {
1962 switch (VT.getSizeInBits()) {
1963 default:
1964 // 8-byte and under are always assumed to be fast.
1965 *Fast = true;
1966 break;
1967 case 128:
1968 *Fast = !Subtarget.isUnalignedMem16Slow();
1969 break;
1970 case 256:
1971 *Fast = !Subtarget.isUnalignedMem32Slow();
1972 break;
1973 // TODO: What about AVX-512 (512-bit) accesses?
1974 }
1975 }
1976 // Misaligned accesses of any size are always allowed.
1977 return true;
1978 }
1980 /// Return the entry encoding for a jump table in the
1981 /// current function. The returned value is a member of the
1982 /// MachineJumpTableInfo::JTEntryKind enum.
1983 unsigned X86TargetLowering::getJumpTableEncoding() const {
1984 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1985 // symbol.
1986 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1987 return MachineJumpTableInfo::EK_Custom32;
1989 // Otherwise, use the normal jump table encoding heuristics.
1990 return TargetLowering::getJumpTableEncoding();
1993 bool X86TargetLowering::useSoftFloat() const {
1994 return Subtarget.useSoftFloat();
1997 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1998 ArgListTy &Args) const {
2000 // Only relabel X86-32 for C / Stdcall CCs.
2001 if (Subtarget.is64Bit())
2002 return;
2003 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2004 return;
2005 unsigned ParamRegs = 0;
2006 if (auto *M = MF->getFunction().getParent())
2007 ParamRegs = M->getNumberRegisterParameters();
2009 // Mark the first N int arguments as being passed in registers.
2010 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2011 Type *T = Args[Idx].Ty;
2012 if (T->isIntOrPtrTy())
2013 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2014 unsigned numRegs = 1;
2015 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2016 numRegs = 2;
2017 if (ParamRegs < numRegs)
2018 return;
2019 ParamRegs -= numRegs;
2020 Args[Idx].IsInReg = true;
2021 }
2022 }
2023 }
2025 const MCExpr *
2026 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2027 const MachineBasicBlock *MBB,
2028 unsigned uid,MCContext &Ctx) const{
2029 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2030 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2031 // entries.
2032 return MCSymbolRefExpr::create(MBB->getSymbol(),
2033 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2036 /// Returns relocation base for the given PIC jumptable.
2037 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2038 SelectionDAG &DAG) const {
2039 if (!Subtarget.is64Bit())
2040 // This doesn't have SDLoc associated with it, but is not really the
2041 // same as a Register.
2042 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2043 getPointerTy(DAG.getDataLayout()));
2044 return Table;
2045 }
2047 /// This returns the relocation base for the given PIC jumptable,
2048 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2049 const MCExpr *X86TargetLowering::
2050 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2051 MCContext &Ctx) const {
2052 // X86-64 uses RIP relative addressing based on the jump table label.
2053 if (Subtarget.isPICStyleRIPRel())
2054 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2056 // Otherwise, the reference is relative to the PIC base.
2057 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2060 std::pair<const TargetRegisterClass *, uint8_t>
2061 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2062 MVT VT) const {
2063 const TargetRegisterClass *RRC = nullptr;
2064 uint8_t Cost = 1;
2065 switch (VT.SimpleTy) {
2066 default:
2067 return TargetLowering::findRepresentativeClass(TRI, VT);
2068 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2069 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2070 break;
2071 case MVT::x86mmx:
2072 RRC = &X86::VR64RegClass;
2073 break;
2074 case MVT::f32: case MVT::f64:
2075 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2076 case MVT::v4f32: case MVT::v2f64:
2077 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2078 case MVT::v8f32: case MVT::v4f64:
2079 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2080 case MVT::v16f32: case MVT::v8f64:
2081 RRC = &X86::VR128XRegClass;
2082 break;
2083 }
2084 return std::make_pair(RRC, Cost);
2085 }
2087 unsigned X86TargetLowering::getAddressSpace() const {
2088 if (Subtarget.is64Bit())
2089 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2090 return 256;
2091 }
2093 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2094 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2095 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2098 static Constant* SegmentOffset(IRBuilder<> &IRB,
2099 unsigned Offset, unsigned AddressSpace) {
2100 return ConstantExpr::getIntToPtr(
2101 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2102 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2103 }
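// On x86, address space 256 maps to %gs and 257 to %fs, so e.g.
// SegmentOffset(IRB, 0x28, 257) yields an i8 addrspace(257)* that reads
// %fs:0x28 when dereferenced.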
2105 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2106 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2107 // tcbhead_t; use it instead of the usual global variable (see
2108 // sysdeps/{i386,x86_64}/nptl/tls.h)
2109 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2110 if (Subtarget.isTargetFuchsia()) {
2111 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2112 return SegmentOffset(IRB, 0x10, getAddressSpace());
2113 }
2114 // %fs:0x28, unless we're using a Kernel code model, in which case
2115 // it's %gs:0x28. gs:0x14 on i386.
2116 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2117 return SegmentOffset(IRB, Offset, getAddressSpace());
2118 }
2121 return TargetLowering::getIRStackGuard(IRB);
2122 }
2124 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2125 // MSVC CRT provides functionalities for stack protection.
2126 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2127 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2128 // MSVC CRT has a global variable holding security cookie.
2129 M.getOrInsertGlobal("__security_cookie",
2130 Type::getInt8PtrTy(M.getContext()));
2132 // MSVC CRT has a function to validate security cookie.
2133 auto *SecurityCheckCookie = cast<Function>(
2134 M.getOrInsertFunction("__security_check_cookie",
2135 Type::getVoidTy(M.getContext()),
2136 Type::getInt8PtrTy(M.getContext())));
2137 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2138 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2139 return;
2140 }
2141 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2142 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2143 return;
2144 TargetLowering::insertSSPDeclarations(M);
2145 }
2147 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2148 // MSVC CRT has a global variable holding security cookie.
2149 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2150 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2151 return M.getGlobalVariable("__security_cookie");
2152 }
2153 return TargetLowering::getSDagStackGuard(M);
2154 }
2156 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2157 // MSVC CRT has a function to validate security cookie.
2158 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2159 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2160 return M.getFunction("__security_check_cookie");
2161 }
2162 return TargetLowering::getSSPStackGuardCheck(M);
2163 }
2165 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2166 if (Subtarget.getTargetTriple().isOSContiki())
2167 return getDefaultSafeStackPointerLocation(IRB, false);
2169 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2170 // definition of TLS_SLOT_SAFESTACK in
2171 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2172 if (Subtarget.isTargetAndroid()) {
2173 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2174 // %gs:0x48.
2175 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2176 return SegmentOffset(IRB, Offset, getAddressSpace());
2177 }
2179 // Fuchsia is similar.
2180 if (Subtarget.isTargetFuchsia()) {
2181 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2182 return SegmentOffset(IRB, 0x18, getAddressSpace());
2183 }
2185 return TargetLowering::getSafeStackPointerLocation(IRB);
2186 }
2188 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2189 unsigned DestAS) const {
2190 assert(SrcAS != DestAS && "Expected different address spaces!");
2192 return SrcAS < 256 && DestAS < 256;
2193 }
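// i.e. casts among ordinary address spaces are free, but any cast into or out
// of the segment-register spaces (256 and up, e.g. %gs/%fs) must be preserved
// because those change how the address is computed.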
2195 //===----------------------------------------------------------------------===//
2196 // Return Value Calling Convention Implementation
2197 //===----------------------------------------------------------------------===//
2199 #include "X86GenCallingConv.inc"
2201 bool X86TargetLowering::CanLowerReturn(
2202 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2203 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2204 SmallVector<CCValAssign, 16> RVLocs;
2205 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2206 return CCInfo.CheckReturn(Outs, RetCC_X86);
2209 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2210 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2211 return ScratchRegs;
2212 }
2214 /// Lowers mask values (v*i1) to local register values.
2215 /// \returns the DAG node after lowering to a register type.
2216 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2217 const SDLoc &Dl, SelectionDAG &DAG) {
2218 EVT ValVT = ValArg.getValueType();
2220 if (ValVT == MVT::v1i1)
2221 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2222 DAG.getIntPtrConstant(0, Dl));
2224 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2225 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2226 // Two stage lowering might be required
2227 // bitcast: v8i1 -> i8 / v16i1 -> i16
2228 // anyextend: i8 -> i32 / i16 -> i32
2229 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2230 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2231 if (ValLoc == MVT::i32)
2232 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2233 return ValToCopy;
2234 }
2236 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2237 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2238 // One stage lowering is required
2239 // bitcast: v32i1 -> i32 / v64i1 -> i64
2240 return DAG.getBitcast(ValLoc, ValArg);
2243 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2244 }
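// e.g. a v16i1 mask returned in an i32 location is bitcast to i16 and then
// any-extended to i32, while a v64i1 mask in an i64 location needs only the
// single bitcast.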
2246 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2247 static void Passv64i1ArgInRegs(
2248 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2249 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2250 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2251 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2252 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2253 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2254 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2255 "The value should reside in two registers");
2257 // Before splitting the value we cast it to i64
2258 Arg = DAG.getBitcast(MVT::i64, Arg);
2260 // Splitting the value into two i32 types
2261 SDValue Lo, Hi;
2262 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2263 DAG.getConstant(0, Dl, MVT::i32));
2264 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2265 DAG.getConstant(1, Dl, MVT::i32));
2267 // Attach the two i32 types into corresponding registers
2268 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2269 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2270 }
2272 SDValue
2273 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2274 bool isVarArg,
2275 const SmallVectorImpl<ISD::OutputArg> &Outs,
2276 const SmallVectorImpl<SDValue> &OutVals,
2277 const SDLoc &dl, SelectionDAG &DAG) const {
2278 MachineFunction &MF = DAG.getMachineFunction();
2279 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2281 // In some cases we need to disable registers from the default CSR list.
2282 // For example, when they are used for argument passing.
2283 bool ShouldDisableCalleeSavedRegister =
2284 CallConv == CallingConv::X86_RegCall ||
2285 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2287 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2288 report_fatal_error("X86 interrupts may not return any value");
2290 SmallVector<CCValAssign, 16> RVLocs;
2291 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2292 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2294 SDValue Flag;
2295 SmallVector<SDValue, 6> RetOps;
2296 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2297 // Operand #1 = Bytes To Pop
2298 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2299 MVT::i32));
2301 // Copy the result values into the output registers.
2302 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2303 ++I, ++OutsIndex) {
2304 CCValAssign &VA = RVLocs[I];
2305 assert(VA.isRegLoc() && "Can only return in registers!");
2307 // Add the register to the CalleeSaveDisableRegs list.
2308 if (ShouldDisableCalleeSavedRegister)
2309 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2311 SDValue ValToCopy = OutVals[OutsIndex];
2312 EVT ValVT = ValToCopy.getValueType();
2314 // Promote values to the appropriate types.
2315 if (VA.getLocInfo() == CCValAssign::SExt)
2316 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2317 else if (VA.getLocInfo() == CCValAssign::ZExt)
2318 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2319 else if (VA.getLocInfo() == CCValAssign::AExt) {
2320 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2321 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2322 else
2323 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2324 }
2325 else if (VA.getLocInfo() == CCValAssign::BCvt)
2326 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2328 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2329 "Unexpected FP-extend for return value.");
2331 // If this is x86-64, and we disabled SSE, we can't return FP values,
2332 // or SSE or MMX vectors.
2333 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2334 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2335 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2336 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2337 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2338 } else if (ValVT == MVT::f64 &&
2339 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2340 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2341 // llvm-gcc has never done it right and no one has noticed, so this
2342 // should be OK for now.
2343 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2344 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2347 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2348 // the RET instruction and handled by the FP Stackifier.
2349 if (VA.getLocReg() == X86::FP0 ||
2350 VA.getLocReg() == X86::FP1) {
2351 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2352 // change the value to the FP stack register class.
2353 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2354 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2355 RetOps.push_back(ValToCopy);
2356 // Don't emit a copytoreg.
2357 continue;
2358 }
2360 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2361 // which is returned in RAX / RDX.
2362 if (Subtarget.is64Bit()) {
2363 if (ValVT == MVT::x86mmx) {
2364 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2365 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2366 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2367 ValToCopy);
2368 // If we don't have SSE2 available, convert to v4f32 so the generated
2369 // register is legal.
2370 if (!Subtarget.hasSSE2())
2371 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2372 }
2373 }
2374 }
2376 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2378 if (VA.needsCustom()) {
2379 assert(VA.getValVT() == MVT::v64i1 &&
2380 "Currently the only custom case is when we split v64i1 to 2 regs");
2382 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2383 Subtarget);
2385 assert(2 == RegsToPass.size() &&
2386 "Expecting two registers after Pass64BitArgInRegs");
2388 // Add the second register to the CalleeSaveDisableRegs list.
2389 if (ShouldDisableCalleeSavedRegister)
2390 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2391 } else {
2392 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2393 }
2395 // Add nodes to the DAG and add the values into the RetOps list
2396 for (auto &Reg : RegsToPass) {
2397 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2398 Flag = Chain.getValue(1);
2399 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2400 }
2401 }
2403 // Swift calling convention does not require we copy the sret argument
2404 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2406 // All x86 ABIs require that for returning structs by value we copy
2407 // the sret argument into %rax/%eax (depending on ABI) for the return.
2408 // We saved the argument into a virtual register in the entry block,
2409 // so now we copy the value out and into %rax/%eax.
2411 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2412 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2413 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2414 // either case FuncInfo->setSRetReturnReg() will have been called.
2415 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2416 // When we have both sret and another return value, we should use the
2417 // original Chain stored in RetOps[0], instead of the current Chain updated
2418 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2420 // For the case of sret and another return value, we have
2421 // Chain_0 at the function entry
2422 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2423 // If we use Chain_1 in getCopyFromReg, we will have
2424 // Val = getCopyFromReg(Chain_1)
2425 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2427 // getCopyToReg(Chain_0) will be glued together with
2428 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2429 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2430 // Data dependency from Unit B to Unit A due to usage of Val in
2431 // getCopyToReg(Chain_1, Val)
2432 // Chain dependency from Unit A to Unit B
2434 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2435 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2436 getPointerTy(MF.getDataLayout()));
2438 unsigned RetValReg
2439 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2440 X86::RAX : X86::EAX;
2441 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2442 Flag = Chain.getValue(1);
2444 // RAX/EAX now acts like a return value.
2445 RetOps.push_back(
2446 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2448 // Add the returned register to the CalleeSaveDisableRegs list.
2449 if (ShouldDisableCalleeSavedRegister)
2450 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2451 }
2453 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2454 const MCPhysReg *I =
2455 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2456 if (I) {
2457 for (; *I; ++I) {
2458 if (X86::GR64RegClass.contains(*I))
2459 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2460 else
2461 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2462 }
2463 }
2465 RetOps[0] = Chain; // Update chain.
2467 // Add the flag if we have it.
2468 if (Flag.getNode())
2469 RetOps.push_back(Flag);
2471 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2472 if (CallConv == CallingConv::X86_INTR)
2473 opcode = X86ISD::IRET;
2474 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2475 }
2477 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2478 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2479 return false;
2481 SDValue TCChain = Chain;
2482 SDNode *Copy = *N->use_begin();
2483 if (Copy->getOpcode() == ISD::CopyToReg) {
2484 // If the copy has a glue operand, we conservatively assume it isn't safe to
2485 // perform a tail call.
2486 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2488 TCChain = Copy->getOperand(0);
2489 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2490 return false;
2492 bool HasRet = false;
2493 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2494 UI != UE; ++UI) {
2495 if (UI->getOpcode() != X86ISD::RET_FLAG)
2496 return false;
2497 // If we are returning more than one value, we can definitely
2498 // not make a tail call; see PR19530.
2499 if (UI->getNumOperands() > 4)
2500 return false;
2501 if (UI->getNumOperands() == 4 &&
2502 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2503 return false;
2504 HasRet = true;
2505 }
2507 if (!HasRet)
2508 return false;
2510 Chain = TCChain;
2511 return true;
2512 }
2514 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2515 ISD::NodeType ExtendKind) const {
2516 MVT ReturnMVT = MVT::i32;
2518 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2519 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2520 // The ABI does not require i1, i8 or i16 to be extended.
2522 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2523 // always extending i8/i16 return values, so keep doing that for now.
2525 ReturnMVT = MVT::i8;
2528 EVT MinVT = getRegisterType(Context, ReturnMVT);
2529 return VT.bitsLT(MinVT) ? MinVT : VT;
2530 }
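// For example, an i16 return value is widened to i32 on Darwin (preserving
// Clang's historical behaviour) but stays i16 elsewhere, since the ABI does
// not require the extension.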
2532 /// Reads two 32 bit registers and creates a 64 bit mask value.
2533 /// \param VA The current 32 bit value that needs to be assigned.
2534 /// \param NextVA The next 32 bit value that needs to be assigned.
2535 /// \param Root The parent DAG node.
2536 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2537 /// glue purposes. In case the DAG is already using a
2538 /// physical register instead of a virtual one, we should
2539 /// glue our new SDValue to the InFlag SDValue.
2540 /// \return a new 64 bit SDValue.
2541 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2542 SDValue &Root, SelectionDAG &DAG,
2543 const SDLoc &Dl, const X86Subtarget &Subtarget,
2544 SDValue *InFlag = nullptr) {
2545 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2546 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2547 assert(VA.getValVT() == MVT::v64i1 &&
2548 "Expecting first location of 64 bit width type");
2549 assert(NextVA.getValVT() == VA.getValVT() &&
2550 "The locations should have the same type");
2551 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2552 "The values should reside in two registers");
2554 SDValue Lo, Hi;
2555 unsigned Reg;
2556 SDValue ArgValueLo, ArgValueHi;
2558 MachineFunction &MF = DAG.getMachineFunction();
2559 const TargetRegisterClass *RC = &X86::GR32RegClass;
2561 // Read a 32 bit value from the registers.
2562 if (nullptr == InFlag) {
2563 // When no physical register is present,
2564 // create an intermediate virtual register.
2565 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2566 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2567 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2568 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2569 } else {
2570 // When a physical register is available read the value from it and glue
2571 // the reads together.
2572 ArgValueLo =
2573 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2574 *InFlag = ArgValueLo.getValue(2);
2575 ArgValueHi =
2576 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2577 *InFlag = ArgValueHi.getValue(2);
2578 }
2580 // Convert the i32 type into v32i1 type.
2581 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2583 // Convert the i32 type into v32i1 type.
2584 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2586 // Concatenate the two values together.
2587 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2588 }
2590 /// The function will lower a register of various sizes (8/16/32/64)
2591 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2592 /// \returns a DAG node containing the operand after lowering to mask type.
2593 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2594 const EVT &ValLoc, const SDLoc &Dl,
2595 SelectionDAG &DAG) {
2596 SDValue ValReturned = ValArg;
2598 if (ValVT == MVT::v1i1)
2599 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2601 if (ValVT == MVT::v64i1) {
2602 // On a 32 bit machine, this case is handled by getv64i1Argument.
2603 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2604 // On a 64 bit machine, there is no need to truncate the value; only bitcast.
2605 } else {
2606 MVT maskLen;
2607 switch (ValVT.getSimpleVT().SimpleTy) {
2608 case MVT::v8i1:
2609 maskLen = MVT::i8;
2610 break;
2611 case MVT::v16i1:
2612 maskLen = MVT::i16;
2613 break;
2614 case MVT::v32i1:
2615 maskLen = MVT::i32;
2616 break;
2617 default:
2618 llvm_unreachable("Expecting a vector of i1 types");
2619 }
2621 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2622 }
2623 return DAG.getBitcast(ValVT, ValReturned);
2624 }
2626 /// Lower the result values of a call into the
2627 /// appropriate copies out of appropriate physical registers.
2629 SDValue X86TargetLowering::LowerCallResult(
2630 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2631 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2632 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2633 uint32_t *RegMask) const {
2635 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2636 // Assign locations to each value returned by this call.
2637 SmallVector<CCValAssign, 16> RVLocs;
2638 bool Is64Bit = Subtarget.is64Bit();
2639 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2640 *DAG.getContext());
2641 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2643 // Copy all of the result registers out of their specified physreg.
2644 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2645 ++I, ++InsIndex) {
2646 CCValAssign &VA = RVLocs[I];
2647 EVT CopyVT = VA.getLocVT();
2649 // In some calling conventions we need to remove the used registers
2650 // from the register mask.
2651 if (RegMask) {
2652 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2653 SubRegs.isValid(); ++SubRegs)
2654 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2655 }
2657 // If this is x86-64, and we disabled SSE, we can't return FP values
2658 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2659 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2660 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2661 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2664 // If we prefer to use the value in xmm registers, copy it out as f80 and
2665 // use a truncate to move it from fp stack reg to xmm reg.
2666 bool RoundAfterCopy = false;
2667 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2668 isScalarFPTypeInSSEReg(VA.getValVT())) {
2669 if (!Subtarget.hasX87())
2670 report_fatal_error("X87 register return with X87 disabled");
2671 CopyVT = MVT::f80;
2672 RoundAfterCopy = (CopyVT != VA.getLocVT());
2673 }
2675 SDValue Val;
2676 if (VA.needsCustom()) {
2677 assert(VA.getValVT() == MVT::v64i1 &&
2678 "Currently the only custom case is when we split v64i1 to 2 regs");
2679 Val =
2680 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2681 } else {
2682 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2683 .getValue(1);
2684 Val = Chain.getValue(0);
2685 InFlag = Chain.getValue(2);
2686 }
2688 if (RoundAfterCopy)
2689 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2690 // This truncation won't change the value.
2691 DAG.getIntPtrConstant(1, dl));
2693 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2694 if (VA.getValVT().isVector() &&
2695 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2696 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2697 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2698 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2699 } else
2700 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2701 }
2703 InVals.push_back(Val);
2704 }
2706 return Chain;
2707 }
2709 //===----------------------------------------------------------------------===//
2710 // C & StdCall & Fast Calling Convention implementation
2711 //===----------------------------------------------------------------------===//
2712 // The StdCall calling convention is the standard for many Windows API
2713 // routines and the like. It differs from the C calling convention only
2714 // slightly: the callee cleans up the stack, not the caller. Symbols are
2715 // also decorated in some fancy way :) It doesn't support vector arguments.
2716 // For info on fast calling convention see Fast Calling Convention (tail call)
2717 // implementation LowerX86_32FastCCCallTo.
2719 /// CallIsStructReturn - Determines whether a call uses struct return
2720 /// semantics.
2721 enum StructReturnType {
2722 NotStructReturn,
2723 RegStructReturn,
2724 StackStructReturn
2725 };
2726 static StructReturnType
2727 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
2728 if (Outs.empty())
2729 return NotStructReturn;
2731 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2732 if (!Flags.isSRet())
2733 return NotStructReturn;
2734 if (Flags.isInReg() || IsMCU)
2735 return RegStructReturn;
2736 return StackStructReturn;
2737 }
2739 /// Determines whether a function uses struct return semantics.
2740 static StructReturnType
2741 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
2742 if (Ins.empty())
2743 return NotStructReturn;
2745 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2746 if (!Flags.isSRet())
2747 return NotStructReturn;
2748 if (Flags.isInReg() || IsMCU)
2749 return RegStructReturn;
2750 return StackStructReturn;
2751 }
2753 /// Make a copy of an aggregate at address specified by "Src" to address
2754 /// "Dst" with size and alignment information specified by the specific
2755 /// parameter attribute. The copy will be passed as a byval function parameter.
2756 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2757 SDValue Chain, ISD::ArgFlagsTy Flags,
2758 SelectionDAG &DAG, const SDLoc &dl) {
2759 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2761 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2762 /*isVolatile*/false, /*AlwaysInline=*/true,
2763 /*isTailCall*/false,
2764 MachinePointerInfo(), MachinePointerInfo());
2765 }
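// Because AlwaysInline is set, a small byval aggregate (say, a 16-byte
// struct) is copied with a short load/store sequence instead of an actual
// call to memcpy.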
2767 /// Return true if the calling convention is one that we can guarantee TCO for.
2768 static bool canGuaranteeTCO(CallingConv::ID CC) {
2769 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2770 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2771 CC == CallingConv::HHVM);
2772 }
2774 /// Return true if we might ever do TCO for calls with this calling convention.
2775 static bool mayTailCallThisCC(CallingConv::ID CC) {
2776 switch (CC) {
2777 // C calling conventions:
2778 case CallingConv::C:
2779 case CallingConv::Win64:
2780 case CallingConv::X86_64_SysV:
2781 // Callee pop conventions:
2782 case CallingConv::X86_ThisCall:
2783 case CallingConv::X86_StdCall:
2784 case CallingConv::X86_VectorCall:
2785 case CallingConv::X86_FastCall:
2786 return true;
2787 default:
2788 return canGuaranteeTCO(CC);
2789 }
2790 }
2792 /// Return true if the function is being made into a tailcall target by
2793 /// changing its ABI.
2794 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2795 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2798 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2799 auto Attr =
2800 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2801 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2802 return false;
2804 ImmutableCallSite CS(CI);
2805 CallingConv::ID CalleeCC = CS.getCallingConv();
2806 if (!mayTailCallThisCC(CalleeCC))
2807 return false;
2809 return true;
2810 }
2812 SDValue
2813 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2814 const SmallVectorImpl<ISD::InputArg> &Ins,
2815 const SDLoc &dl, SelectionDAG &DAG,
2816 const CCValAssign &VA,
2817 MachineFrameInfo &MFI, unsigned i) const {
2818 // Create the nodes corresponding to a load from this parameter slot.
2819 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2820 bool AlwaysUseMutable = shouldGuaranteeTCO(
2821 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
2826 // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // bit width.
2829 bool ExtendedInMem =
2830 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2831 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
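  // Illustrative note (not from the original comment): e.g. a v1i1 value
  // whose location was extended to i8 is loaded from memory as an i8 below
  // and truncated back to the mask type, instead of extending the value in
  // memory.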
2833 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();
2838 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
  // taken by a return address.
  int Offset = 0;
  if (CallConv == CallingConv::X86_INTR) {
2842 // X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in a regular call.
    // The offset of the last argument needs to be set to -4/-8 bytes.
    // The offset of the first of two arguments should be set to 0 bytes.
2846 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2847 if (Subtarget.is64Bit() && Ins.size() == 2) {
2848 // The stack pointer needs to be realigned for 64 bit handlers with error
      // code, so the argument offset changes by 8 bytes.
      Offset += 8;
    }
  }
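  // Worked example (illustrative, derived from the formula above): for a
  // 64-bit handler taking two arguments, Ins.size() == 2, so
  //   i == 0: Offset = 8 * ((1 % 2) - 1) =  0   (interrupt frame argument)
  //   i == 1: Offset = 8 * ((2 % 2) - 1) = -8   (error code slot)
  // and the realignment case above then shifts both offsets by 8 bytes.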
2854 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2855 // changed with more analysis.
  // In the case of tail call optimization, mark all arguments mutable, since
  // they could be overwritten by the lowering of arguments in case of a tail
  // call.
2858 if (Flags.isByVal()) {
2859 unsigned Bytes = Flags.getByValSize();
2860 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2862 // FIXME: For now, all byval parameter objects are marked as aliasing. This
2863 // can be improved with deeper analysis.
2864 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
2865 /*isAliased=*/true);
2866 // Adjust SP offset of interrupt parameter.
2867 if (CallConv == CallingConv::X86_INTR) {
      MFI.setObjectOffset(FI, Offset);
    }
    return DAG.getFrameIndex(FI, PtrVT);
  }
2873 // This is an argument in memory. We might be able to perform copy elision.
2874 if (Flags.isCopyElisionCandidate()) {
    EVT ArgVT = Ins[i].ArgVT;
    SDValue PartAddr;
2877 if (Ins[i].PartOffset == 0) {
2878 // If this is a one-part value or the first part of a multi-part value,
2879 // create a stack object for the entire argument value type and return a
2880 // load from our portion of it. This assumes that if the first part of an
2881 // argument is in memory, the rest will also be in memory.
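      // (Illustrative example, not from the original source: an i64 argument
      // split into two i32 parts on a 32-bit target creates one 8-byte fixed
      // object here for PartOffset == 0; the second part is located again by
      // the fixed-object scan in the else branch below.)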
2882 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2883 /*Immutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    } else {
2889 // This is not the first piece of an argument in memory. See if there is
2890 // already a fixed stack object including this offset. If so, assume it
2891 // was created by the PartOffset == 0 branch above and create a load from
2892 // the appropriate offset into it.
2893 int64_t PartBegin = VA.getLocMemOffset();
2894 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2895 int FI = MFI.getObjectIndexBegin();
2896 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2897 int64_t ObjBegin = MFI.getObjectOffset(FI);
2898 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
          break;
      }
2902 if (MFI.isFixedObjectIndex(FI)) {
        SDValue Addr =
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2905 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
2908 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
    }
  }
2914 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2915 VA.getLocMemOffset(), isImmutable);
2917 // Set SExt or ZExt flag.
2918 if (VA.getLocInfo() == CCValAssign::ZExt) {
2919 MFI.setObjectZExt(FI, true);
2920 } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }
2924 // Adjust SP offset of interrupt parameter.
2925 if (CallConv == CallingConv::X86_INTR) {
    MFI.setObjectOffset(FI, Offset);
  }
2929 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2930 SDValue Val = DAG.getLoad(
2931 ValVT, dl, Chain, FIN,
2932 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2933 return ExtendedInMem
2934 ? (VA.getValVT().isVector()
2935 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}
2940 // FIXME: Get this from tablegen.
2941 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2942 const X86Subtarget &Subtarget) {
2943 assert(Subtarget.is64Bit());
2945 if (Subtarget.isCallingConvWin64(CallConv)) {
2946 static const MCPhysReg GPR64ArgRegsWin64[] = {
      X86::RCX, X86::RDX, X86::R8, X86::R9
    };
    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
                        std::end(GPR64ArgRegsWin64));
  }
2952 static const MCPhysReg GPR64ArgRegs64Bit[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
                      std::end(GPR64ArgRegs64Bit));
}
2958 // FIXME: Get this from tablegen.
2959 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2960 CallingConv::ID CallConv,
2961 const X86Subtarget &Subtarget) {
2962 assert(Subtarget.is64Bit());
2963 if (Subtarget.isCallingConvWin64(CallConv)) {
2964 // The XMM registers which might contain var arg parameters are shadowed
2965 // in their paired GPR. So we only need to save the GPR to their home
    // slots.
    // TODO: __vectorcall will change this.
    return None;
  }
2971 const Function &F = MF.getFunction();
2972 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2973 bool isSoftFloat = Subtarget.useSoftFloat();
2974 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2975 "SSE register cannot be used when SSE is disabled!");
2976 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
    // registers.
    return None;
2981 static const MCPhysReg XMMArgRegs64Bit[] = {
2982 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
  };
  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}

#ifndef NDEBUG
2989 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
2990 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2991 [](const CCValAssign &A, const CCValAssign &B) -> bool {
                          return A.getValNo() < B.getValNo();
                        });
}
#endif
2997 SDValue X86TargetLowering::LowerFormalArguments(
2998 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2999 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3000 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3001 MachineFunction &MF = DAG.getMachineFunction();
3002 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3003 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3005 const Function &F = MF.getFunction();
3006 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3007 F.getName() == "main")
3008 FuncInfo->setForceFramePointer(true);
3010 MachineFrameInfo &MFI = MF.getFrameInfo();
3011 bool Is64Bit = Subtarget.is64Bit();
3012 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
  assert(
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");
3018 if (CallConv == CallingConv::X86_INTR) {
3019 bool isLegal = Ins.size() == 1 ||
3020 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
3021 (!Is64Bit && Ins[1].VT == MVT::i32)));
3023 report_fatal_error("X86 interrupts may take one or two arguments");
3026 // Assign locations to all of the incoming arguments.
3027 SmallVector<CCValAssign, 16> ArgLocs;
3028 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3030 // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);
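  // Note (an explanatory aside, not from the original source): these 32 bytes
  // are the Win64 "home area" - one 8-byte slot for each of RCX, RDX, R8 and
  // R9 - which the callee may use to spill its register arguments.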
3034 CCInfo.AnalyzeArguments(Ins, CC_X86);
3036 // In vectorcall calling convention a second pass is required for the HVA
  // arguments.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  }
  // The next loop assumes that the locations are in the same order as the
  // arguments in the Ins array.
3044 assert(isSortedByValueNo(ArgLocs) &&
3045 "Argument Location list must be sorted before lowering");
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
3050 assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];
    SDValue ArgValue;
3053 if (VA.isRegLoc()) {
3054 EVT RegVT = VA.getLocVT();
3055 if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");
        // v64i1 values, in the regcall calling convention, that are
        // compiled for a 32-bit target are split up into two registers.
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
3065 const TargetRegisterClass *RC;
3066 if (RegVT == MVT::i8)
3067 RC = &X86::GR8RegClass;
3068 else if (RegVT == MVT::i16)
3069 RC = &X86::GR16RegClass;
3070 else if (RegVT == MVT::i32)
3071 RC = &X86::GR32RegClass;
3072 else if (Is64Bit && RegVT == MVT::i64)
3073 RC = &X86::GR64RegClass;
3074 else if (RegVT == MVT::f32)
3075 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3076 else if (RegVT == MVT::f64)
3077 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3078 else if (RegVT == MVT::f80)
3079 RC = &X86::RFP80RegClass;
3080 else if (RegVT == MVT::f128)
3081 RC = &X86::VR128RegClass;
3082 else if (RegVT.is512BitVector())
3083 RC = &X86::VR512RegClass;
3084 else if (RegVT.is256BitVector())
3085 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3086 else if (RegVT.is128BitVector())
3087 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3088 else if (RegVT == MVT::x86mmx)
3089 RC = &X86::VR64RegClass;
3090 else if (RegVT == MVT::v1i1)
3091 RC = &X86::VK1RegClass;
3092 else if (RegVT == MVT::v8i1)
3093 RC = &X86::VK8RegClass;
3094 else if (RegVT == MVT::v16i1)
3095 RC = &X86::VK16RegClass;
3096 else if (RegVT == MVT::v32i1)
3097 RC = &X86::VK32RegClass;
3098 else if (RegVT == MVT::v64i1)
3099 RC = &X86::VK64RegClass;
3101 llvm_unreachable("Unknown argument type!");
3103 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }
3107 // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
3110 if (VA.getLocInfo() == CCValAssign::SExt)
3111 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3112 DAG.getValueType(VA.getValVT()));
3113 else if (VA.getLocInfo() == CCValAssign::ZExt)
3114 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3115 DAG.getValueType(VA.getValVT()));
3116 else if (VA.getLocInfo() == CCValAssign::BCvt)
3117 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3119 if (VA.isExtInLoc()) {
3120 // Handle MMX values passed in XMM regs.
3121 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3122 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3123 else if (VA.getValVT().isVector() &&
3124 VA.getValVT().getScalarType() == MVT::i1 &&
3125 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3126 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3127 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3128 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
3133 assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    }
3138 // If value is passed via pointer - do a load.
3139 if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
    InVals.push_back(ArgValue);
  }
3146 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3147 // Swift calling convention does not require we copy the sret argument
3148 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift)
      continue;
3152 // All x86 ABIs require that for returning structs by value we copy the
3153 // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
3156 if (Ins[I].Flags.isSRet()) {
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (!Reg) {
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      break;
    }
  }
3169 unsigned StackSize = CCInfo.getNextStackOffset();
3170 // Align stack specially for tail calls.
3171 if (shouldGuaranteeTCO(CallConv,
3172 MF.getTarget().Options.GuaranteedTailCallOpt))
3173 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3175 // If the function takes variable number of arguments, make a frame index for
3176 // the start of the first vararg value... for expansion of llvm.va_start. We
3177 // can skip this if there are no va_start calls.
3178 if (MFI.hasVAStart() &&
3179 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3180 CallConv != CallingConv::X86_ThisCall))) {
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
  }
3184 // Figure out if XMM registers are in use.
3185 assert(!(Subtarget.useSoftFloat() &&
3186 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3187 "SSE register cannot be used when SSE is disabled!");
3189 // 64-bit calling conventions support varargs and register parameters, so we
3190 // have to do extra work to spill them in the prologue.
3191 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3192 // Find the first unallocated argument registers.
3193 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3194 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3195 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3196 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3197 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3198 "SSE register cannot be used when SSE is disabled!");
3200 // Gather all the live in physical registers.
3201 SmallVector<SDValue, 6> LiveGPRs;
3202 SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal;
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3205 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(
          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
    }
3209 if (!ArgXMMs.empty()) {
3210 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3211 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3212 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3213 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3214 LiveXMMRegs.push_back(
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
      }
    }

    if (IsWin64) {
3220 // Get to the caller-allocated home save location. Add 8 to account
3221 // for the return address.
3222 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3223 FuncInfo->setRegSaveFrameIndex(
3224 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3225 // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
3229 // For X86-64, if there are vararg parameters that are passed via
3230 // registers, then we must store them to their spots on the stack so
3231 // they may be loaded by dereferencing the result of va_next.
3232 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3233 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
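      // Worked example (illustrative): the SysV x86-64 ABI has 6 argument
      // GPRs, so the GP save area is 6 * 8 = 48 bytes; if 3 XMM registers are
      // already taken, the first spilled FP vararg lands at offset
      // 48 + 3 * 16 = 96 within the register save area.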
3234 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
    }
3238 // Store the integer parameter registers.
3239 SmallVector<SDValue, 8> MemOps;
3240 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3241 getPointerTy(DAG.getDataLayout()));
3242 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3243 for (SDValue Val : LiveGPRs) {
3244 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3245 RSFIN, DAG.getIntPtrConstant(Offset, dl));
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
3248 MachinePointerInfo::getFixedStack(
3249 DAG.getMachineFunction(),
3250 FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }
3255 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3256 // Now store the XMM (fp + vector) parameter registers.
3257 SmallVector<SDValue, 12> SaveXMMOps;
3258 SaveXMMOps.push_back(Chain);
3259 SaveXMMOps.push_back(ALVal);
3260 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3261 FuncInfo->getRegSaveFrameIndex(), dl));
3262 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3263 FuncInfo->getVarArgsFPOffset(), dl));
      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
                        LiveXMMRegs.end());
3266 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                   MVT::Other, SaveXMMOps));
    }
3270 if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  }
3274 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3275 // Find the largest legal vector type.
3276 MVT VecVT = MVT::Other;
3277 // FIXME: Only some x86_32 calling conventions support AVX512.
3278 if (Subtarget.hasAVX512() &&
3279 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3280 CallConv == CallingConv::Intel_OCL_BI)))
3281 VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;
3287 // We forward some GPRs and some vector types.
3288 SmallVector<MVT, 2> RegParmTypes;
3289 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3290 RegParmTypes.push_back(IntVT);
3291 if (VecVT != MVT::Other)
3292 RegParmTypes.push_back(VecVT);
3294 // Compute the set of forwarded registers. The rest are scratch.
3295 SmallVectorImpl<ForwardedRegister> &Forwards =
3296 FuncInfo->getForwardedMustTailRegParms();
3297 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3299 // Conservatively forward AL on x86_64, since it might be used for varargs.
3300 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3301 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
    }
3305 // Copy all forwards from physical to virtual registers.
3306 for (ForwardedRegister &F : Forwards) {
3307 // FIXME: Can we use a less constrained schedule?
3308 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3309 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
    }
  }
3314 // Some CCs need callee pop.
3315 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3316 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3317 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3318 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
3321 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3324 // If this is an sret function, the return should pop the hidden pointer.
3325 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3326 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3327 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
3332 // RegSaveFrameIndex is X86-64 only.
3333 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3334 if (CallConv == CallingConv::X86_FastCall ||
3335 CallConv == CallingConv::X86_ThisCall)
3336 // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }
3340 FuncInfo->setArgumentStackSize(StackSize);
3342 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3343 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3344 if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
3347 // that we'd prefer this slot be allocated towards the bottom of the frame
3348 // (i.e. near the stack pointer after allocating the frame). Every
3349 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3350 // offset from the bottom of this and each funclet's frame must be the
3351 // same, so the size of funclets' (mostly empty) frames is dictated by
3352 // how far this slot is from the bottom (since they allocate just enough
3353 // space to accommodate holding this slot at the correct offset).
3354 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }
3359 if (CallConv == CallingConv::X86_RegCall ||
3360 F.hasFnAttribute("no_caller_saved_registers")) {
3361 MachineRegisterInfo &MRI = MF.getRegInfo();
3362 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
      MRI.disableCalleeSavedRegister(Pair.first);
  }

  return Chain;
}
3369 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3370 SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
3373 ISD::ArgFlagsTy Flags) const {
3374 unsigned LocMemOffset = VA.getLocMemOffset();
3375 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                       StackPtr, PtrOff);
3378 if (Flags.isByVal())
3379 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3381 return DAG.getStore(
3382 Chain, dl, Arg, PtrOff,
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
3386 /// Emit a load of return address if tail call
3387 /// optimization is performed and it is required.
3388 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3389 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3390 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3391 // Adjust the Return address stack slot.
3392 EVT VT = getPointerTy(DAG.getDataLayout());
3393 OutRetAddr = getReturnAddressFrameIndex(DAG);
3395 // Load the "old" Return address.
3396 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
  return SDValue(OutRetAddr.getNode(), 1);
}
3400 /// Emit a store of the return address if tail call
3401 /// optimization is performed and it is required (FPDiff!=0).
3402 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3403 SDValue Chain, SDValue RetAddrFrIdx,
3404 EVT PtrVT, unsigned SlotSize,
3405 int FPDiff, const SDLoc &dl) {
3406 // Store the return address to the appropriate stack slot.
3407 if (!FPDiff) return Chain;
3408 // Calculate the new stack slot for the return address.
3409 int NewReturnAddrFI =
      MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
                                          false);
3412 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3413 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3414 MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(), NewReturnAddrFI));
  return Chain;
}
/// Returns a vector_shuffle mask for a movs{s|d}, movd
/// operation of specified width.
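/// Illustrative example (not part of the original comment): for
/// VT == MVT::v4f32 this builds the shuffle mask <4, 1, 2, 3>; indices >=
/// NumElems select from V2, so element 0 comes from V2 and elements 1-3 come
/// from V1, which is exactly the MOVSS behaviour.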
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
                       SDValue V2) {
3423 unsigned NumElems = VT.getVectorNumElements();
3424 SmallVector<int, 8> Mask;
3425 Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

SDValue
3432 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3433 SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
3436 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3437 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3438 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3439 SDValue Chain = CLI.Chain;
3440 SDValue Callee = CLI.Callee;
3441 CallingConv::ID CallConv = CLI.CallConv;
3442 bool &isTailCall = CLI.IsTailCall;
3443 bool isVarArg = CLI.IsVarArg;
3445 MachineFunction &MF = DAG.getMachineFunction();
3446 bool Is64Bit = Subtarget.is64Bit();
3447 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3448 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3449 bool IsSibcall = false;
3450 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3451 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3452 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3453 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3454 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3455 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3456 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
  bool HasNoCfCheck =
      (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3459 const Module *M = MF.getMMI().getModule();
3460 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3462 if (CallConv == CallingConv::X86_INTR)
3463 report_fatal_error("X86 interrupts may not be called directly");
  if (Attr.getValueAsString() == "true")
    isTailCall = false;
3468 if (Subtarget.isPICStyleGOT() &&
3469 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3470 // If we are using a GOT, disable tail calls to external symbols with
3471 // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
3474 // GuaranteedTailCallOpt will override this.
3475 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3476 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
               G->getGlobal()->hasDefaultVisibility()))
      isTailCall = false;
  }
3481 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
  if (IsMustTail) {
    // Force this to be a tail call. The verifier rules are enough to ensure
    // that we can lower this successfully without moving the return address
    // around.
    isTailCall = true;
3487 } else if (isTailCall) {
3488 // Check if it's really possible to do a tail call.
3489 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3490 isVarArg, SR != NotStructReturn,
3491 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3492 Outs, OutVals, Ins, DAG);
    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }
3503 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3504 "Var args not supported with calling convention fastcc, ghc or hipe");
3506 // Analyze operands of the call, assigning locations to each operand.
3507 SmallVector<CCValAssign, 16> ArgLocs;
3508 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3510 // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);
3514 CCInfo.AnalyzeArguments(Outs, CC_X86);
3516 // In vectorcall calling convention a second pass is required for the HVA
  // arguments.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
  }
3522 // Get a count of how many bytes are to be pushed on the stack.
3523 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in the caller's
    // own caller's stack.
    NumBytes = 0;
3528 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3529 canGuaranteeTCO(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
3533 if (isTailCall && !IsSibcall && !IsMustTail) {
3534 // Lower arguments at fp - stackoffset + fpdiff.
3535 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3537 FPDiff = NumBytesCallerPushed - NumBytes;
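    // Illustrative example: if the caller occupies 16 bytes of argument stack
    // and the callee needs 32, FPDiff = 16 - 32 = -16, i.e. the return
    // address slot must move 16 bytes towards lower addresses.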
3539 // Set the delta of movement of the returnaddr stackslot.
3540 // But only set if delta is greater than previous delta.
3541 if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }
3545 unsigned NumBytesToPush = NumBytes;
3546 unsigned NumBytesToPop = NumBytes;
  // If we have an inalloca argument, all stack space has already been
  // allocated for us and is right at the top of the stack. We don't support
  // multiple
3550 // arguments passed in memory when using inalloca.
3551 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
3556 if (ArgLocs.back().getLocMemOffset() != 0)
3557 report_fatal_error("any parameter with the inalloca attribute must be "
3558 "the only memory argument");
3562 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3563 NumBytes - NumBytesToPush, dl);
3565 SDValue RetAddrFrIdx;
3566 // Load return address for tail calls.
3567 if (isTailCall && FPDiff)
3568 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3569 Is64Bit, FPDiff, dl);
3571 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;
  // The next loop assumes that the locations are in the same order as the
  // arguments in the Outs array.
3577 assert(isSortedByValueNo(ArgLocs) &&
3578 "Argument Location list must be sorted before lowering");
3580 // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization arguments are handled later.
3582 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++OutIndex) {
3585 assert(OutIndex < Outs.size() && "Invalid Out index");
3586 // Skip inalloca arguments, they have already been written.
3587 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
    if (Flags.isInAlloca())
      continue;
3591 CCValAssign &VA = ArgLocs[I];
3592 EVT RegVT = VA.getLocVT();
3593 SDValue Arg = OutVals[OutIndex];
3594 bool isByVal = Flags.isByVal();
3596 // Promote the value if needed.
3597 switch (VA.getLocInfo()) {
3598 default: llvm_unreachable("Unknown loc info!");
3599 case CCValAssign::Full: break;
3600 case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
3603 case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
3606 case CCValAssign::AExt:
3607 if (Arg.getValueType().isVector() &&
3608 Arg.getValueType().getVectorElementType() == MVT::i1)
3609 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3610 else if (RegVT.is128BitVector()) {
3611 // Special case: passing MMX values in XMM registers.
3612 Arg = DAG.getBitcast(MVT::i64, Arg);
3613 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3614 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
3618 case CCValAssign::BCvt:
      Arg = DAG.getBitcast(RegVT, Arg);
      break;
3621 case CCValAssign::Indirect: {
3622 // Store the argument.
3623 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3624 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3625 Chain = DAG.getStore(
3626 Chain, dl, Arg, SpillSlot,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
      Arg = SpillSlot;
      break;
    }
    }
3633 if (VA.needsCustom()) {
3634 assert(VA.getValVT() == MVT::v64i1 &&
3635 "Currently the only custom case is when we split v64i1 to 2 regs");
3636 // Split v64i1 value into two registers
      Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
                         Subtarget);
3639 } else if (VA.isRegLoc()) {
3640 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3641 if (isVarArg && IsWin64) {
3642 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3643 // shadow reg if callee is a varargs function.
3644 unsigned ShadowReg = 0;
3645 switch (VA.getLocReg()) {
3646 case X86::XMM0: ShadowReg = X86::RCX; break;
3647 case X86::XMM1: ShadowReg = X86::RDX; break;
3648 case X86::XMM2: ShadowReg = X86::R8; break;
3649 case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
3654 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3655 assert(VA.isMemLoc());
3656 if (!StackPtr.getNode())
3657 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3658 getPointerTy(DAG.getDataLayout()));
3659 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }
3664 if (!MemOpChains.empty())
3665 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3667 if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(
3672 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                                          getPointerTy(DAG.getDataLayout()))));
    } else {
3675 // If we are tail calling and generating PIC/GOT style code load the
3676 // address of the callee into ECX. The value in ecx is used as target of
3677 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3678 // for tail calls on PIC/GOT architectures. Normally we would just put the
3679 // address of GOT into ebx and then call target@PLT. But for tail calls
3680 // ebx would be restored (since ebx is callee saved) before jumping to the
      // target.
      //
      // Note: The actual moving to ECX is done further down.
3684 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3685 if (G && !G->getGlobal()->hasLocalLinkage() &&
3686 G->getGlobal()->hasDefaultVisibility())
3687 Callee = LowerGlobalAddress(Callee, DAG);
3688 else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }
3693 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3694 // From AMD64 ABI document:
3695 // For calls that may call functions that use varargs or stdargs
3696 // (prototype-less calls or calls to functions containing ellipsis (...) in
3697 // the declaration) %al is used as hidden argument to specify the number
3698 // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
3700 // registers used and is in the range 0 - 8 inclusive.
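    // Illustrative example (not from the ABI text itself): for a call like
    // printf("%f %f\n", a, b) with both doubles in XMM0/XMM1, %al may be set
    // to 2, or conservatively to any upper bound up to 8.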
3702 // Count the number of XMM registers allocated.
3703 static const MCPhysReg XMMArgRegs[] = {
3704 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3705 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3707 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3708 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3709 && "SSE registers cannot be used when SSE is disabled");
3711 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, dl,
                                                        MVT::i8)));
  }
3716 if (isVarArg && IsMustTail) {
3717 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3718 for (const auto &F : Forwards) {
3719 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
    }
  }
3724 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3725 // don't need this because the eligibility check rejects calls that require
3726 // shuffling arguments passed in memory.
3727 if (!IsSibcall && isTailCall) {
3728 // Force all the incoming stack arguments to be loaded from the stack
3729 // before any new outgoing arguments are stored to the stack, because the
3730 // outgoing stack slots may alias the incoming argument stack slots, and
3731 // the alias isn't otherwise explicit. This is slightly more conservative
3732 // than necessary, because it means that each store effectively depends
3733 // on every argument instead of just those arguments it would clobber.
3734 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3736 SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
         ++I, ++OutsIndex) {
3741 CCValAssign &VA = ArgLocs[I];
3743 if (VA.isRegLoc()) {
3744 if (VA.needsCustom()) {
3745 assert((CallConv == CallingConv::X86_RegCall) &&
3746 "Expecting custom case only in regcall calling convention");
          // This means that we are in a special case where one argument was
          // passed through two register locations - skip the next location.
          ++I;
        }

        continue;
      }

      assert(VA.isMemLoc());
3756 SDValue Arg = OutVals[OutsIndex];
3757 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3758 // Skip inalloca arguments. They don't require any work.
3759 if (Flags.isInAlloca())
3761 // Create frame index.
3762 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3763 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3764 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3765 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3767 if (Flags.isByVal()) {
3768 // Copy relative to framepointer.
3769 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3770 if (!StackPtr.getNode())
3771 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3772 getPointerTy(DAG.getDataLayout()));
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                               StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain, Flags,
                                                           DAG, dl));
        } else {
3780 // Store relative to framepointer.
3781 MemOpChains2.push_back(DAG.getStore(
3782 ArgChain, dl, Arg, FIN,
              MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
        }
      }
3787 if (!MemOpChains2.empty())
3788 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3790 // Store the return address to the appropriate stack slot.
3791 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3792 getPointerTy(DAG.getDataLayout()),
                                     RegInfo->getSlotSize(), FPDiff, dl);
  }
3796 // Build a sequence of copy-to-reg nodes chained together with token chain
3797 // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3800 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3801 RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }
3805 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3806 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3807 // In the 64-bit large code model, we have to make all calls
3808 // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
3811 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3812 // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
    // it.
3815 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
3819 const GlobalValue *GV = G->getGlobal();
3820 if (!GV->hasDLLImportStorageClass()) {
3821 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3823 Callee = DAG.getTargetGlobalAddress(
3824 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
      if (OpFlags == X86II::MO_GOTPCREL) {
        // Add a wrapper.
3828 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3829 getPointerTy(DAG.getDataLayout()), Callee);
3830 // Add extra indirection
3831 Callee = DAG.getLoad(
3832 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
            MachinePointerInfo::getGOT(DAG.getMachineFunction()));
      }
    }
3836 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3837 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3838 unsigned char OpFlags =
3839 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3841 Callee = DAG.getTargetExternalSymbol(
3842 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3844 if (OpFlags == X86II::MO_GOTPCREL) {
3845 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3846 getPointerTy(DAG.getDataLayout()), Callee);
3847 Callee = DAG.getLoad(
3848 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
    }
3851 } else if (Subtarget.isTarget64BitILP32() &&
3852 Callee->getValueType(0) == MVT::i32) {
3853 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
  }
3857 // Returns a chain & a flag for retval copy to use.
3858 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3859 SmallVector<SDValue, 8> Ops;
3861 if (!IsSibcall && isTailCall) {
3862 Chain = DAG.getCALLSEQ_END(Chain,
3863 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3864 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }
3868 Ops.push_back(Chain);
3869 Ops.push_back(Callee);
  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
  // Add argument registers to the end of the list so that they are known live
  // into the call.
3876 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3877 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3878 RegsToPass[i].second.getValueType()));
3880 // Add a register mask operand representing the call-preserved registers.
  // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists), then we
  // use the X86_INTR calling convention because it has the same CSR mask
  // (same preserved registers).
3884 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3885 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3886 assert(Mask && "Missing call preserved mask for calling convention");
3888 // If this is an invoke in a 32-bit function using a funclet-based
3889 // personality, assume the function clobbers all registers. If an exception
3890 // is thrown, the runtime will not restore CSRs.
3891 // FIXME: Model this more precisely so that we can register allocate across
3892 // the normal edge and spill and fill across the exceptional edge.
3893 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3894 const Function &CallerFn = MF.getFunction();
3895 EHPersonality Pers =
3896 CallerFn.hasPersonalityFn()
3897 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3898 : EHPersonality::Unknown;
3899 if (isFuncletEHPersonality(Pers))
      Mask = RegInfo->getNoPreservedMask();
  }
3903 // Define a new register mask from the existing mask.
3904 uint32_t *RegMask = nullptr;
3906 // In some calling conventions we need to remove the used physical registers
3907 // from the reg mask.
3908 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3909 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3911 // Allocate a new Reg Mask and copy Mask.
3912 RegMask = MF.allocateRegMask();
3913 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
3914 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
    // Make sure all sub registers of the argument registers are reset
    // in the RegMask.
3918 for (auto const &RegPair : RegsToPass)
3919 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3920 SubRegs.isValid(); ++SubRegs)
3921 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3923 // Create the RegMask Operand according to our updated mask.
    Ops.push_back(DAG.getRegisterMask(RegMask));
  } else {
3926 // Create the RegMask Operand according to the static mask.
    Ops.push_back(DAG.getRegisterMask(Mask));
  }
3930 if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
3935 //// If this is the first return lowered for this function, add the regs
3936 //// to the liveout set for the function.
3937 // This isn't right, although it's probably harmless on x86; liveouts
3938 // should be computed from returns not tail calls. Consider a void
3939 // function making a tail call to a function returning int.
3940 MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
  }
3944 if (HasNoCfCheck && IsCFProtectionSupported) {
    Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
  } else {
    Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
  }
3949 InFlag = Chain.getValue(1);
3951 // Create the CALLSEQ_END node.
3952 unsigned NumBytesForCalleeToPop;
3953 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3954 DAG.getTarget().Options.GuaranteedTailCallOpt))
3955 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3956 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3957 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3958 SR == StackStructReturn)
3959 // If this is a call to a struct-return function, the callee
3960 // pops the hidden struct pointer, so we have to push it back.
3961 // This is common for Darwin/X86, Linux & Mingw32 targets.
3962 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3963 NumBytesForCalleeToPop = 4;
  else
    NumBytesForCalleeToPop = 0; // Callee pops nothing.
3967 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3968 // No need to reset the stack after the call if the call doesn't return. To
3969 // make the MI verify, we'll pretend the callee does it for us.
    NumBytesForCalleeToPop = NumBytes;
  }
  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }
  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, RegMask);
}
3989 //===----------------------------------------------------------------------===//
3990 // Fast Calling Convention (tail call) implementation
3991 //===----------------------------------------------------------------------===//
//  Like stdcall, the callee cleans up the arguments; the convention differs in
//  that ECX is reserved for storing the tail-called function address. Only 2
//  registers are free for argument passing (inreg). Tail call optimization is
//  performed provided:
//                * tailcallopt is enabled
3998 // * caller/callee are fastcc
3999 // On X86_64 architecture with GOT-style position independent code only local
4000 // (within module) calls are supported at the moment.
4001 // To keep the stack aligned according to platform abi the function
4002 // GetAlignedArgumentStackSize ensures that argument delta is always multiples
4003 // of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
4004 // If a tail called function callee has more arguments than the caller the
4005 // caller needs to make sure that there is room to move the RETADDR to. This is
4006 // achieved by reserving an area the size of the argument delta right after the
4007 // original RETADDR, but before the saved framepointer or the spilled registers
// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
// stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..
/// Make the stack size aligned, e.g. to 16n + 12 for a 16-byte alignment
/// requirement.
unsigned
4023 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
4024 SelectionDAG& DAG) const {
4025 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4026 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
4027 unsigned StackAlignment = TFI.getStackAlignment();
4028 uint64_t AlignMask = StackAlignment - 1;
4029 int64_t Offset = StackSize;
4030 unsigned SlotSize = RegInfo->getSlotSize();
4031 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
4032 // Number smaller than 12 so just add the difference.
4033 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
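    // Worked example (illustrative): StackSize = 22 with StackAlignment = 16
    // and SlotSize = 4: 22 & 15 = 6 <= 12, so Offset = 22 + (12 - 6) = 28,
    // which is 16 + 12 as required.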
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
4036 Offset = ((~AlignMask) & Offset) + StackAlignment +
             (StackAlignment - SlotSize);
  }
  return Offset;
}
4042 /// Return true if the given stack call argument is already available in the
4043 /// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4046 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4047 const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
4051 // Look through nodes that don't alter the bits of the incoming value.
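    // (Illustrative: an argument like "zext i8 %x to i32" still occupies the
    // same fixed stack slot as the incoming parameter, so extension and
    // bitcast nodes are peeled off before the frame index checks below.)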
4052 unsigned Op = Arg.getOpcode();
4053 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
      Arg = Arg.getOperand(0);
      continue;
    }
4057 if (Op == ISD::TRUNCATE) {
4058 const SDValue &TruncInput = Arg.getOperand(0);
4059 if (TruncInput.getOpcode() == ISD::AssertZext &&
4060 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4061 Arg.getValueType()) {
        Arg = TruncInput.getOperand(0);
        continue;
      }
    }
    break;
  }

  int FI = INT_MAX;
4070 if (Arg.getOpcode() == ISD::CopyFromReg) {
4071 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
4077 if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
4081 unsigned Opcode = Def->getOpcode();
4082 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4083 Opcode == X86::LEA64_32r) &&
4084 Def->getOperand(1).isFI()) {
4085 FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
4090 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4091 if (Flags.isByVal())
4092 // ByVal argument is passed in as a pointer but it's now being
4093 // dereferenced. e.g.
4094 // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
4098 SDValue Ptr = Ld->getBasePtr();
4099 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
4103 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4104 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4105 FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;
4110 assert(FI != INT_MAX);
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  if (Offset != MFI.getObjectOffset(FI))
    return false;
4117 // If this is not byval, check that the argument stack object is immutable.
4118 // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;
4124 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4125 // If the argument location is wider than the argument type, check that any
4126 // extension flags match.
4127 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

  return Bytes == MFI.getObjectSize(FI);
}
4136 /// Check whether the call is eligible for tail call optimization. Targets
4137 /// that want to do tail call optimization should implement this function.
4138 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4139 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4140 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4141 const SmallVectorImpl<ISD::OutputArg> &Outs,
4142 const SmallVectorImpl<SDValue> &OutVals,
4143 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;
4147 // If -tailcallopt is specified, make fastcc functions tail-callable.
4148 MachineFunction &MF = DAG.getMachineFunction();
4149 const Function &CallerF = MF.getFunction();
4151 // If the function return type is x86_fp80 and the callee return type is not,
4152 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4153 // perform a tailcall optimization here.
  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;
4157 CallingConv::ID CallerCC = CallerF.getCallingConv();
4158 bool CCMatch = CallerCC == CalleeCC;
4159 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4160 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4162 // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;
4168 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }
4174 // Look for obvious safe cases to perform tail call optimization that do not
4175 // require ABI changes. This is what gcc calls sibcall.
4177 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4178 // emit a special epilogue.
4179 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;
4183 // Also avoid sibcall optimization if either caller or callee uses struct
4184 // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;
  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
4190 LLVMContext &C = *DAG.getContext();
4191 if (isVarArg && !Outs.empty()) {
4192 // Optimizing for varargs on Win64 is unlikely to be safe without
4193 // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;
4197 SmallVector<CCValAssign, 16> ArgLocs;
4198 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4200 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4201 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }
4206 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4207 // stack. Therefore, if it's not used by the call it is not safe to optimize
4208 // this into a sibcall.
4209 bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
4217 SmallVector<CCValAssign, 16> RVLocs;
4218 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4219 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4220 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4221 CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }
4227 // Check that the call results are passed in the same way.
4228 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
4231 // The callee has to preserve all registers the caller needs to preserve.
4232 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4233 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }
4240 unsigned StackArgsSize = 0;
  // If the callee takes no arguments then go on to check the results of the
  // call.
4244 if (!Outs.empty()) {
4245 // Check if stack adjustment is needed. For now, do not do this if any
4246 // argument is passed on the stack.
4247 SmallVector<CCValAssign, 16> ArgLocs;
4248 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4250 // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);
4254 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4255 StackArgsSize = CCInfo.getNextStackOffset();
4257 if (CCInfo.getNextStackOffset()) {
4258 // Check if the arguments are already laid out in the right way as
4259 // the caller's fixed stack objects.
4260 MachineFrameInfo &MFI = MF.getFrameInfo();
4261 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4262 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4263 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4264 CCValAssign &VA = ArgLocs[i];
4265 SDValue Arg = OutVals[i];
4266 ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
4269 if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII, VA))
            return false;
        }
      }
    }
4277 bool PositionIndependent = isPositionIndependent();
4278 // If the tailcall address may be in a register, then make sure it's
4279 // possible to register allocate for it. In 32-bit, the call address can
4280 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4281 // callee-saved registers are restored. These happen to be the same
4282 // registers used to pass 'inreg' arguments so watch out for those.
4283 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4284 !isa<ExternalSymbolSDNode>(Callee)) ||
4285 PositionIndependent)) {
4286 unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
4289 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4291 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
4298 case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }
4306 const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }
4311 bool CalleeWillPop =
4312 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4313 MF.getTarget().Options.GuaranteedTailCallOpt);
4315 if (unsigned BytesToPop =
4316 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4317 // If we have bytes to pop, the callee must pop them.
4318 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
4321 } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }

  return true;
}

FastISel *
4330 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4331 const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}
4335 //===----------------------------------------------------------------------===//
4336 // Other Lowering Hooks
4337 //===----------------------------------------------------------------------===//
4339 static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}
4343 static bool MayFoldIntoStore(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}
4347 static bool MayFoldIntoZeroExtend(SDValue Op) {
4348 if (Op.hasOneUse()) {
4349 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4350 return (ISD::ZERO_EXTEND == Opcode);
4355 static bool isTargetShuffle(unsigned Opcode) {
4357 default: return false;
4358 case X86ISD::BLENDI:
4359 case X86ISD::PSHUFB:
4360 case X86ISD::PSHUFD:
4361 case X86ISD::PSHUFHW:
4362 case X86ISD::PSHUFLW:
4364 case X86ISD::INSERTPS:
4365 case X86ISD::EXTRQI:
4366 case X86ISD::INSERTQI:
4367 case X86ISD::PALIGNR:
4368 case X86ISD::VSHLDQ:
4369 case X86ISD::VSRLDQ:
4370 case X86ISD::MOVLHPS:
4371 case X86ISD::MOVHLPS:
4372 case X86ISD::MOVSHDUP:
4373 case X86ISD::MOVSLDUP:
4374 case X86ISD::MOVDDUP:
4377 case X86ISD::UNPCKL:
4378 case X86ISD::UNPCKH:
4379 case X86ISD::VBROADCAST:
4380 case X86ISD::VPERMILPI:
4381 case X86ISD::VPERMILPV:
4382 case X86ISD::VPERM2X128:
4383 case X86ISD::SHUF128:
4384 case X86ISD::VPERMIL2:
4385 case X86ISD::VPERMI:
4386 case X86ISD::VPPERM:
4387 case X86ISD::VPERMV:
4388 case X86ISD::VPERMV3:
4389 case X86ISD::VZEXT_MOVL:
4394 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4396 default: return false;
4398 case X86ISD::PSHUFB:
4399 case X86ISD::VPERMILPV:
4400 case X86ISD::VPERMIL2:
4401 case X86ISD::VPPERM:
4402 case X86ISD::VPERMV:
4403 case X86ISD::VPERMV3:
4405 // 'Faux' Target Shuffles.
4412 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4413 MachineFunction &MF = DAG.getMachineFunction();
4414 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4415 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4416 int ReturnAddrIndex = FuncInfo->getRAIndex();
4418 if (ReturnAddrIndex == 0) {
4419 // Set up a frame object for the return address.
4420 unsigned SlotSize = RegInfo->getSlotSize();
4421 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4424 FuncInfo->setRAIndex(ReturnAddrIndex);
4427 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4430 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4431 bool hasSymbolicDisplacement) {
4432 // Offset should fit into a 32-bit immediate field.
4433 if (!isInt<32>(Offset))
4436 // If we don't have a symbolic displacement - we don't have any extra
4437 // restrictions.
4438 if (!hasSymbolicDisplacement)
4441 // FIXME: Some tweaks might be needed for medium code model.
4442 if (M != CodeModel::Small && M != CodeModel::Kernel)
4445 // For the small code model, we assume that the last object is 16MB before
4446 // the end of the 31-bit boundary. We may also accept pretty large negative
4447 // constants, knowing that all objects are in the positive half of the address space.
4448 if (M == CodeModel::Small && Offset < 16*1024*1024)
4451 // For the kernel code model we know that all objects reside in the negative
4452 // half of the 32-bit address space. We cannot accept negative offsets, since
4453 // they may take the address just past the boundary, but we may accept pretty large positive ones.
4454 if (M == CodeModel::Kernel && Offset >= 0)
4455 return true;
4457 return false;
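// For example (given the checks above): with a symbolic displacement, the
// small code model accepts Offset = 16MB-1 or any negative offset, while the
// kernel code model accepts Offset = +8 but rejects Offset = -8.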
4460 /// Determines whether the callee is required to pop its own arguments.
4461 /// Callee pop is necessary to support tail calls.
4462 bool X86::isCalleePop(CallingConv::ID CallingConv,
4463 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4464 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4465 // can guarantee TCO.
4466 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4469 switch (CallingConv) {
4472 case CallingConv::X86_StdCall:
4473 case CallingConv::X86_FastCall:
4474 case CallingConv::X86_ThisCall:
4475 case CallingConv::X86_VectorCall:
4480 /// Return true if the condition is an unsigned comparison operation.
4481 static bool isX86CCUnsigned(unsigned X86CC) {
4484 llvm_unreachable("Invalid integer condition!");
4500 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4501 switch (SetCCOpcode) {
4502 default: llvm_unreachable("Invalid integer condition!");
4503 case ISD::SETEQ: return X86::COND_E;
4504 case ISD::SETGT: return X86::COND_G;
4505 case ISD::SETGE: return X86::COND_GE;
4506 case ISD::SETLT: return X86::COND_L;
4507 case ISD::SETLE: return X86::COND_LE;
4508 case ISD::SETNE: return X86::COND_NE;
4509 case ISD::SETULT: return X86::COND_B;
4510 case ISD::SETUGT: return X86::COND_A;
4511 case ISD::SETULE: return X86::COND_BE;
4512 case ISD::SETUGE: return X86::COND_AE;
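// For example, the unsigned (setult x, y) maps to COND_B (the JB/SETB/CMOVB
// family), while the signed (setlt x, y) maps to COND_L (JL/SETL/CMOVL).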
4516 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4517 /// condition code, returning the condition code and the LHS/RHS of the
4518 /// comparison to make.
4519 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4520 bool isFP, SDValue &LHS, SDValue &RHS,
4521 SelectionDAG &DAG) {
4523 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4524 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4525 // X > -1 -> X == 0, jump !sign.
4526 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4527 return X86::COND_NS;
4529 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4530 // X < 0 -> X == 0, jump on sign.
4531 return X86::COND_S;
4533 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4534 // X < 1 -> X <= 0
4535 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4536 return X86::COND_LE;
4540 return TranslateIntegerX86CC(SetCCOpcode);
4543 // First determine if it is required or is profitable to flip the operands.
4545 // If LHS is a foldable load, but RHS is not, flip the condition.
4546 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4547 !ISD::isNON_EXTLoad(RHS.getNode())) {
4548 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4549 std::swap(LHS, RHS);
4552 switch (SetCCOpcode) {
4558 std::swap(LHS, RHS);
4562 // On a floating point condition, the flags are set as follows:
4563 //  ZF  PF  CF   op
4564 //   0 | 0 | 0 | X > Y
4565 //   0 | 0 | 1 | X < Y
4566 //   1 | 0 | 0 | X == Y
4567 //   1 | 1 | 1 | unordered
4568 switch (SetCCOpcode) {
4569 default: llvm_unreachable("Condcode should be pre-legalized away");
4571 case ISD::SETEQ: return X86::COND_E;
4572 case ISD::SETOLT: // flipped
4574 case ISD::SETGT: return X86::COND_A;
4575 case ISD::SETOLE: // flipped
4577 case ISD::SETGE: return X86::COND_AE;
4578 case ISD::SETUGT: // flipped
4580 case ISD::SETLT: return X86::COND_B;
4581 case ISD::SETUGE: // flipped
4583 case ISD::SETLE: return X86::COND_BE;
4585 case ISD::SETNE: return X86::COND_NE;
4586 case ISD::SETUO: return X86::COND_P;
4587 case ISD::SETO: return X86::COND_NP;
4589 case ISD::SETUNE: return X86::COND_INVALID;
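// For example, (setolt x, y) is one of the "flipped" cases: the operands are
// swapped beforehand so it becomes COND_A on the swapped comparison, which
// only needs to read CF and ZF.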
4593 /// Is there a floating point cmov for the specific X86 condition code?
4594 /// The current x86 ISA includes the following FP cmov instructions:
4595 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4596 static bool hasFPCMov(unsigned X86CC) {
4613 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4615 MachineFunction &MF,
4616 unsigned Intrinsic) const {
4618 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4622 Info.opc = ISD::INTRINSIC_W_CHAIN;
4623 Info.flags = MachineMemOperand::MONone;
4626 switch (IntrData->Type) {
4627 case TRUNCATE_TO_MEM_VI8:
4628 case TRUNCATE_TO_MEM_VI16:
4629 case TRUNCATE_TO_MEM_VI32: {
4630 Info.ptrVal = I.getArgOperand(0);
4631 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4632 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4633 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4634 ScalarVT = MVT::i8;
4635 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4636 ScalarVT = MVT::i16;
4637 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4638 ScalarVT = MVT::i32;
4640 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4642 Info.flags |= MachineMemOperand::MOStore;
4652 /// Returns true if the target can instruction select the
4653 /// specified FP immediate natively. If false, the legalizer will
4654 /// materialize the FP immediate as a load from a constant pool.
4655 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4656 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4657 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4663 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4664 ISD::LoadExtType ExtTy,
4666 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4667 // relocations must target a movq or addq instruction: don't let the load shrink.
4668 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4669 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4670 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4671 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4675 /// Returns true if it is beneficial to convert a load of a constant
4676 /// to just the constant itself.
4677 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4679 assert(Ty->isIntegerTy());
4681 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4682 if (BitSize == 0 || BitSize > 64)
4687 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4688 // TODO: It might be a win to ease or lift this restriction, but the generic
4689 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4690 if (VT.isVector() && Subtarget.hasAVX512())
4696 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4697 unsigned Index) const {
4698 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4701 // Mask vectors support all subregister combinations and operations that
4702 // extract half of a vector.
4703 if (ResVT.getVectorElementType() == MVT::i1)
4704 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4705 (Index == ResVT.getVectorNumElements()));
4707 return (Index % ResVT.getVectorNumElements()) == 0;
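// For example, extracting a v4i32 subvector from a v8i32 source is cheap at
// index 0 or 4 (a simple subregister reference) but not at index 2.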
4710 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4711 // Speculate cttz only if we can directly use TZCNT.
4712 return Subtarget.hasBMI();
4715 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4716 // Speculate ctlz only if we can directly use LZCNT.
4717 return Subtarget.hasLZCNT();
4720 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4721 EVT BitcastVT) const {
4722 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4725 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4728 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4729 const SelectionDAG &DAG) const {
4730 // Do not merge to float value size (128 bits) if no implicit
4731 // float attribute is set.
4732 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4733 Attribute::NoImplicitFloat);
4735 if (NoFloat) {
4736 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4737 return (MemVT.getSizeInBits() <= MaxIntSize);
4738 }
4739 return true;
4742 bool X86TargetLowering::isCtlzFast() const {
4743 return Subtarget.hasFastLZCNT();
4746 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4747 const Instruction &AndI) const {
4751 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4752 EVT VT = Y.getValueType();
4757 if (!Subtarget.hasBMI())
4760 // There are only 32-bit and 64-bit forms for 'andn'.
4761 if (VT != MVT::i32 && VT != MVT::i64)
4764 // A mask and compare against a constant is ok for an 'andn' too,
4765 // even though the BMI instruction doesn't have an immediate form.
4770 bool X86TargetLowering::hasAndNot(SDValue Y) const {
4771 EVT VT = Y.getValueType();
4773 if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
4774 return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
4778 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
4781 if (VT == MVT::v4i32)
4784 return Subtarget.hasSSE2();
4787 bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
4788 EVT VT = Y.getValueType();
4790 // For vectors, we don't have a preference, but we probably want a mask.
4794 // 64-bit shifts on 32-bit targets produce really bad bloated code.
4795 if (VT == MVT::i64 && !Subtarget.is64Bit())
4801 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4802 MVT VT = MVT::getIntegerVT(NumBits);
4803 if (isTypeLegal(VT))
4806 // PMOVMSKB can handle this.
4807 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4810 // VPMOVMSKB can handle this.
4811 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4814 // TODO: Allow 64-bit type for 32-bit target.
4815 // TODO: 512-bit types should be allowed, but make sure that those
4816 // cases are handled in combineVectorSizedSetCCEquality().
4818 return MVT::INVALID_SIMPLE_VALUE_TYPE;
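// For example, a 32-byte memcmp()==0 expansion on an AVX2 target can compare
// two v32i8 values with VPCMPEQB and test the VPMOVMSKB result against
// all-ones, instead of a chain of scalar compares.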
4821 /// Val is the undef sentinel value or equal to the specified value.
4822 static bool isUndefOrEqual(int Val, int CmpVal) {
4823 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4826 /// Val is either the undef or zero sentinel value.
4827 static bool isUndefOrZero(int Val) {
4828 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
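// Note: throughout these helpers a mask element of SM_SentinelUndef (-1)
// means "don't care" and SM_SentinelZero (-2) means "known zero". For
// example, the mask <0, -1, -2, 3> selects element 0, leaves element 1
// undefined, forces element 2 to zero, and selects element 3.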
4831 /// Return true if every element in Mask, beginning
4832 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4833 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4834 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4835 if (Mask[i] != SM_SentinelUndef)
4840 /// Return true if Val falls within the specified range [Low, Hi).
4841 static bool isInRange(int Val, int Low, int Hi) {
4842 return (Val >= Low && Val < Hi);
4845 /// Return true if the value of any element in Mask falls within the specified
4846 /// range [Low, Hi).
4847 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
4849 if (isInRange(M, Low, Hi))
4854 /// Return true if Val is undef or if its value falls within the
4855 /// specified range [Low, Hi).
4856 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4857 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
4860 /// Return true if every element in Mask is undef or if its value
4861 /// falls within the specified range [Low, Hi).
4862 static bool isUndefOrInRange(ArrayRef<int> Mask,
4865 if (!isUndefOrInRange(M, Low, Hi))
4870 /// Return true if Val is undef, zero or if its value falls within the
4871 /// specified range [Low, Hi).
4872 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4873 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
4876 /// Return true if every element in Mask is undef, zero or if its value
4877 /// falls within the specified range [Low, Hi).
4878 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4880 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4885 /// Return true if every element in Mask, beginning
4886 /// from position Pos and ending in Pos + Size, falls within the specified
4887 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
4888 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
4889 unsigned Size, int Low, int Step = 1) {
4890 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
4891 if (!isUndefOrEqual(Mask[i], Low))
4896 /// Return true if every element in Mask, beginning
4897 /// from position Pos and ending in Pos+Size, falls within the specified
4898 /// sequential range [Low, Low+Size), or is undef or is zero.
4899 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4900 unsigned Size, int Low) {
4901 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4902 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4907 /// Return true if every element in Mask, beginning
4908 /// from position Pos and ending in Pos+Size is undef or is zero.
4909 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4911 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4912 if (!isUndefOrZero(Mask[i]))
4917 /// Helper function to test whether a shuffle mask could be
4918 /// simplified by widening the elements being shuffled.
4920 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4921 /// leaves it in an unspecified state.
4923 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4924 /// shuffle masks. The latter have the special property of a '-2' representing
4925 /// a zeroed lane of a vector.
4926 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4927 SmallVectorImpl<int> &WidenedMask) {
4928 WidenedMask.assign(Mask.size() / 2, 0);
4929 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4931 int M1 = Mask[i + 1];
4933 // If both elements are undef, it's trivial.
4934 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4935 WidenedMask[i / 2] = SM_SentinelUndef;
4939 // Check for an undef mask and a mask value properly aligned to fit with
4940 // a pair of values. If we find such a case, use the non-undef mask's value.
4941 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4942 WidenedMask[i / 2] = M1 / 2;
4945 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4946 WidenedMask[i / 2] = M0 / 2;
4950 // When zeroing, we need to spread the zeroing across both lanes to widen.
4951 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4952 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4953 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4954 WidenedMask[i / 2] = SM_SentinelZero;
4960 // Finally check if the two mask values are adjacent and aligned with
4961 // a pair.
4962 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4963 WidenedMask[i / 2] = M0 / 2;
4967 // Otherwise we can't safely widen the elements used in this shuffle.
4970 assert(WidenedMask.size() == Mask.size() / 2 &&
4971 "Incorrect size of mask after widening the elements!");
4976 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4977 const APInt &Zeroable,
4978 SmallVectorImpl<int> &WidenedMask) {
4979 SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
4980 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
4981 if (TargetMask[i] == SM_SentinelUndef)
4984 TargetMask[i] = SM_SentinelZero;
4986 return canWidenShuffleElements(TargetMask, WidenedMask);
4989 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
4990 SmallVector<int, 32> WidenedMask;
4991 return canWidenShuffleElements(Mask, WidenedMask);
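// For example, the v4 mask <0, 1, 6, 7> widens to the v2 mask <0, 3>, and
// <-1, 1, -2, -2> widens to <0, -2> (the undef element pairs with an
// odd-aligned value and the zero pair stays zero), but <1, 2, 4, 5> cannot
// be widened because elements 1 and 2 are not an aligned adjacent pair.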
4994 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4995 bool X86::isZeroNode(SDValue Elt) {
4996 return isNullConstant(Elt) || isNullFPConstant(Elt);
4999 // Build a vector of constants.
5000 // Use an UNDEF node if MaskElt == -1.
5001 // Split 64-bit constants in 32-bit mode.
5002 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5003 const SDLoc &dl, bool IsMask = false) {
5005 SmallVector<SDValue, 32> Ops;
5008 MVT ConstVecVT = VT;
5009 unsigned NumElts = VT.getVectorNumElements();
5010 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5011 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5012 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5016 MVT EltVT = ConstVecVT.getVectorElementType();
5017 for (unsigned i = 0; i < NumElts; ++i) {
5018 bool IsUndef = Values[i] < 0 && IsMask;
5019 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5020 DAG.getConstant(Values[i], dl, EltVT);
5021 Ops.push_back(OpNode);
5023 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5024 DAG.getConstant(0, dl, EltVT));
5026 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5028 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5032 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5033 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5034 assert(Bits.size() == Undefs.getBitWidth() &&
5035 "Unequal constant and undef arrays");
5036 SmallVector<SDValue, 32> Ops;
5039 MVT ConstVecVT = VT;
5040 unsigned NumElts = VT.getVectorNumElements();
5041 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5042 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5043 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5047 MVT EltVT = ConstVecVT.getVectorElementType();
5048 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5050 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5053 const APInt &V = Bits[i];
5054 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5056 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5057 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5058 } else if (EltVT == MVT::f32) {
5059 APFloat FV(APFloat::IEEEsingle(), V);
5060 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5061 } else if (EltVT == MVT::f64) {
5062 APFloat FV(APFloat::IEEEdouble(), V);
5063 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5065 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5069 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5070 return DAG.getBitcast(VT, ConstsNode);
5073 /// Returns a vector of the specified type with all zero elements.
5074 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5075 SelectionDAG &DAG, const SDLoc &dl) {
5076 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5077 VT.getVectorElementType() == MVT::i1) &&
5078 "Unexpected vector type");
5080 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5081 // type. This ensures they get CSE'd. But if the integer type is not
5082 // available, use a floating-point +0.0 instead.
5084 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5085 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5086 } else if (VT.getVectorElementType() == MVT::i1) {
5087 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5088 "Unexpected vector type");
5089 Vec = DAG.getConstant(0, dl, VT);
5091 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5092 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5094 return DAG.getBitcast(VT, Vec);
5097 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5098 const SDLoc &dl, unsigned vectorWidth) {
5099 EVT VT = Vec.getValueType();
5100 EVT ElVT = VT.getVectorElementType();
5101 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5102 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5103 VT.getVectorNumElements()/Factor);
5105 // Extract the relevant vectorWidth bits by generating an EXTRACT_SUBVECTOR.
5106 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5107 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5109 // This is the index of the first element of the vectorWidth-bit chunk
5110 // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
5111 IdxVal &= ~(ElemsPerChunk - 1);
5113 // If the input is a buildvector just emit a smaller one.
5114 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5115 return DAG.getBuildVector(ResultVT, dl,
5116 Vec->ops().slice(IdxVal, ElemsPerChunk));
5118 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5119 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
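// For example, extracting a 128-bit chunk from a v8i32 source gives
// ElemsPerChunk = 4, so IdxVal = 5 is rounded down to 4 and the upper
// v4i32 half is extracted.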
5122 /// Generate a DAG to grab 128 bits from a vector > 128 bits. This
5123 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5124 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5125 /// instructions or a simple subregister reference. Idx is an index in the
5126 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5127 /// lowering EXTRACT_VECTOR_ELT operations easier.
5128 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5129 SelectionDAG &DAG, const SDLoc &dl) {
5130 assert((Vec.getValueType().is256BitVector() ||
5131 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5132 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5135 /// Generate a DAG to grab 256 bits from a 512-bit vector.
5136 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5137 SelectionDAG &DAG, const SDLoc &dl) {
5138 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5139 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5142 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5143 SelectionDAG &DAG, const SDLoc &dl,
5144 unsigned vectorWidth) {
5145 assert((vectorWidth == 128 || vectorWidth == 256) &&
5146 "Unsupported vector width");
5147 // Inserting an UNDEF subvector leaves Result unchanged.
5148 if (Vec.isUndef())
5149 return Result;
5150 EVT VT = Vec.getValueType();
5151 EVT ElVT = VT.getVectorElementType();
5152 EVT ResultVT = Result.getValueType();
5154 // Insert the relevant vectorWidth bits.
5155 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5156 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5158 // This is the index of the first element of the vectorWidth-bit chunk
5159 // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
5160 IdxVal &= ~(ElemsPerChunk - 1);
5162 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5163 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5166 /// Generate a DAG to put 128 bits into a vector > 128 bits. This
5167 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5168 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5169 /// simple superregister reference. Idx is an index in the 128 bits
5170 /// we want. It need not be aligned to a 128-bit boundary. That makes
5171 /// lowering INSERT_VECTOR_ELT operations easier.
5172 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5173 SelectionDAG &DAG, const SDLoc &dl) {
5174 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5175 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5178 /// Widen a vector to a larger size with the same scalar type, with the new
5179 /// elements either zero or undef.
5180 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5181 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5183 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5184 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5185 "Unsupported vector widening type");
5186 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5188 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5189 DAG.getIntPtrConstant(0, dl));
5192 // Helper for splitting operands of an operation to the legal target size and
5193 // applying a function on each part.
5194 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5195 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5196 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5197 // The argument Builder is a function that will be applied on each split part:
5198 // SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
5199 template <typename F>
5200 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5201 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5202 F Builder, bool CheckBWI = true) {
5203 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5204 unsigned NumSubs = 1;
5205 if ((CheckBWI && Subtarget.useBWIRegs()) ||
5206 (!CheckBWI && Subtarget.useAVX512Regs())) {
5207 if (VT.getSizeInBits() > 512) {
5208 NumSubs = VT.getSizeInBits() / 512;
5209 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5211 } else if (Subtarget.hasAVX2()) {
5212 if (VT.getSizeInBits() > 256) {
5213 NumSubs = VT.getSizeInBits() / 256;
5214 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5217 if (VT.getSizeInBits() > 128) {
5218 NumSubs = VT.getSizeInBits() / 128;
5219 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
5224 return Builder(DAG, DL, Ops);
5226 SmallVector<SDValue, 4> Subs;
5227 for (unsigned i = 0; i != NumSubs; ++i) {
5228 SmallVector<SDValue, 2> SubOps;
5229 for (SDValue Op : Ops) {
5230 EVT OpVT = Op.getValueType();
5231 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5232 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5233 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5235 Subs.push_back(Builder(DAG, DL, SubOps));
5237 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
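// A minimal usage sketch: to lower a 512-bit X86ISD::AVG on an AVX2-only
// target, SplitOpsAndApply emits two 256-bit AVG nodes and concatenates the
// results (operand names here are illustrative):
//   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue Res =
//       SplitOpsAndApply(DAG, Subtarget, DL, VT, {Op0, Op1}, AVGBuilder);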
5240 // Return true if the instruction zeroes the unused upper part of the
5241 // destination and accepts a mask.
5242 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5247 case X86ISD::CMPM_RND:
5253 /// Insert an i1 subvector into an i1 vector.
5254 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5255 const X86Subtarget &Subtarget) {
5258 SDValue Vec = Op.getOperand(0);
5259 SDValue SubVec = Op.getOperand(1);
5260 SDValue Idx = Op.getOperand(2);
5262 if (!isa<ConstantSDNode>(Idx))
5265 // Inserting undef is a nop. We can just return the original vector.
5266 if (SubVec.isUndef())
5269 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5270 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5273 MVT OpVT = Op.getSimpleValueType();
5274 unsigned NumElems = OpVT.getVectorNumElements();
5276 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5278 // Extend to natively supported kshift.
5279 MVT WideOpVT = OpVT;
5280 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5281 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5283 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5284 // if needed.
5285 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5286 // May need to promote to a legal type.
5287 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5288 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5290 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5293 MVT SubVecVT = SubVec.getSimpleValueType();
5294 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5296 assert(IdxVal + SubVecNumElems <= NumElems &&
5297 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5298 "Unexpected index value in INSERT_SUBVECTOR");
5300 SDValue Undef = DAG.getUNDEF(WideOpVT);
5303 // Zero lower bits of the Vec
5304 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5305 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5307 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5308 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5309 // Merge them together, SubVec should be zero extended.
5310 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5311 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5313 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5314 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5317 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5318 Undef, SubVec, ZeroIdx);
5320 if (Vec.isUndef()) {
5321 assert(IdxVal != 0 && "Unexpected index");
5322 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5323 DAG.getConstant(IdxVal, dl, MVT::i8));
5324 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5327 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5328 assert(IdxVal != 0 && "Unexpected index");
5329 NumElems = WideOpVT.getVectorNumElements();
5330 unsigned ShiftLeft = NumElems - SubVecNumElems;
5331 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5332 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5333 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5334 if (ShiftRight != 0)
5335 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5336 DAG.getConstant(ShiftRight, dl, MVT::i8));
5337 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5340 // Simple case when we put the subvector in the upper part.
5341 if (IdxVal + SubVecNumElems == NumElems) {
5342 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5343 DAG.getConstant(IdxVal, dl, MVT::i8));
5344 if (SubVecNumElems * 2 == NumElems) {
5345 // Special case: use a legal zero-extending insert_subvector. This allows
5346 // isel to optimize when bits are known zero.
5347 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5348 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5349 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5352 // Otherwise use explicit shifts to zero the bits.
5353 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5354 Undef, Vec, ZeroIdx);
5355 NumElems = WideOpVT.getVectorNumElements();
5356 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5357 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5358 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5360 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5361 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5364 // Inserting into the middle is more complicated.
5366 NumElems = WideOpVT.getVectorNumElements();
5368 // Widen the vector if needed.
5369 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5370 // Move the current value of the bits being replaced down to the lsbs.
5371 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5372 DAG.getConstant(IdxVal, dl, MVT::i8));
5373 // Xor with the new bit.
5374 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5375 // Shift to MSB, filling bottom bits with 0.
5376 unsigned ShiftLeft = NumElems - SubVecNumElems;
5377 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5378 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5379 // Shift to the final position, filling upper bits with 0.
5380 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5381 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5382 DAG.getConstant(ShiftRight, dl, MVT::i8));
5383 // Xor with the original vector, leaving the new value.
5384 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5385 // Reduce to original width if needed.
5386 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
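// Worked example: WideOpVT = v16i1, SubVecNumElems = 2, IdxVal = 2. The
// KSHIFTR by 2 places the two bits being replaced at bits [1:0]; the XOR
// computes (old ^ new); KSHIFTL by 14 then KSHIFTR by 12 isolates that
// difference at bits [3:2]; the final XOR with Vec flips exactly those two
// bits, replacing them with the SubVec value.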
5389 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
5390 unsigned NumElems, SelectionDAG &DAG,
5391 const SDLoc &dl, unsigned VectorWidth) {
5392 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
5393 return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
5396 /// Returns a vector of specified type with all bits set.
5397 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5398 /// Then bitcast to their original type, ensuring they get CSE'd.
5399 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5400 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5401 "Expected a 128/256/512-bit vector type");
5403 APInt Ones = APInt::getAllOnesValue(32);
5404 unsigned NumElts = VT.getSizeInBits() / 32;
5405 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5406 return DAG.getBitcast(VT, Vec);
5409 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5410 SelectionDAG &DAG) {
5411 EVT InVT = In.getValueType();
5412 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5414 if (VT.is128BitVector() && InVT.is128BitVector())
5415 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5416 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5418 // For 256-bit vectors, we only need the lower (128-bit) input half.
5419 // For 512-bit vectors, we only need the lower input half or quarter.
5420 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5421 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5422 In = extractSubVector(In, 0, DAG, DL,
5423 std::max(128, (int)VT.getSizeInBits() / Scale));
5426 return DAG.getNode(Opc, DL, VT, In);
5429 /// Returns a vector_shuffle node for an unpackl operation.
5430 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5431 SDValue V1, SDValue V2) {
5432 SmallVector<int, 8> Mask;
5433 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5434 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5437 /// Returns a vector_shuffle node for an unpackh operation.
5438 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5439 SDValue V1, SDValue V2) {
5440 SmallVector<int, 8> Mask;
5441 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5442 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5445 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5446 /// This produces a shuffle where the low element of V2 is swizzled into the
5447 /// zero/undef vector, landing at element Idx.
5448 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5449 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5451 const X86Subtarget &Subtarget,
5452 SelectionDAG &DAG) {
5453 MVT VT = V2.getSimpleValueType();
5455 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5456 int NumElems = VT.getVectorNumElements();
5457 SmallVector<int, 16> MaskVec(NumElems);
5458 for (int i = 0; i != NumElems; ++i)
5459 // If this is the insertion idx, put the low elt of V2 here.
5460 MaskVec[i] = (i == Idx) ? NumElems : i;
5461 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5464 static SDValue peekThroughBitcasts(SDValue V) {
5465 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5466 V = V.getOperand(0);
5470 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5471 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5472 V.getOperand(0).hasOneUse())
5473 V = V.getOperand(0);
5477 // Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
5478 static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
5479 while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
5480 V = V.getOperand(0);
5484 static const Constant *getTargetConstantFromNode(SDValue Op) {
5485 Op = peekThroughBitcasts(Op);
5487 auto *Load = dyn_cast<LoadSDNode>(Op);
5491 SDValue Ptr = Load->getBasePtr();
5492 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5493 Ptr->getOpcode() == X86ISD::WrapperRIP)
5494 Ptr = Ptr->getOperand(0);
5496 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5497 if (!CNode || CNode->isMachineConstantPoolEntry())
5500 return dyn_cast<Constant>(CNode->getConstVal());
5503 // Extract raw constant bits from constant pools.
5504 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5506 SmallVectorImpl<APInt> &EltBits,
5507 bool AllowWholeUndefs = true,
5508 bool AllowPartialUndefs = true) {
5509 assert(EltBits.empty() && "Expected an empty EltBits vector");
5511 Op = peekThroughBitcasts(Op);
5513 EVT VT = Op.getValueType();
5514 unsigned SizeInBits = VT.getSizeInBits();
5515 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5516 unsigned NumElts = SizeInBits / EltSizeInBits;
5518 // Bitcast a source array of element bits to the target size.
5519 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5520 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5521 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5522 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5523 "Constant bit sizes don't match");
5525 // Don't split if we don't allow undef bits.
5526 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5527 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5530 // If we're already the right size, don't bother bitcasting.
5531 if (NumSrcElts == NumElts) {
5532 UndefElts = UndefSrcElts;
5533 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5537 // Extract all the undef/constant element data and pack into single bitsets.
5538 APInt UndefBits(SizeInBits, 0);
5539 APInt MaskBits(SizeInBits, 0);
5541 for (unsigned i = 0; i != NumSrcElts; ++i) {
5542 unsigned BitOffset = i * SrcEltSizeInBits;
5543 if (UndefSrcElts[i])
5544 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5545 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5548 // Split the undef/constant single bitset data into the target elements.
5549 UndefElts = APInt(NumElts, 0);
5550 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5552 for (unsigned i = 0; i != NumElts; ++i) {
5553 unsigned BitOffset = i * EltSizeInBits;
5554 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5556 // Only treat an element as UNDEF if all bits are UNDEF.
5557 if (UndefEltBits.isAllOnesValue()) {
5558 if (!AllowWholeUndefs)
5560 UndefElts.setBit(i);
5564 // If only some bits are UNDEF then treat them as zero (or bail if not
5565 // supported).
5566 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5569 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5570 EltBits[i] = Bits; // Keep the full APInt; don't truncate through uint64_t.
5575 // Collect constant bits and insert into mask/undef bit masks.
5576 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5577 unsigned UndefBitIndex) {
5580 if (isa<UndefValue>(Cst)) {
5581 Undefs.setBit(UndefBitIndex);
5584 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5585 Mask = CInt->getValue();
5588 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5589 Mask = CFP->getValueAPF().bitcastToAPInt();
5597 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5598 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5599 return CastBitData(UndefSrcElts, SrcEltBits);
5602 // Extract scalar constant bits.
5603 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5604 APInt UndefSrcElts = APInt::getNullValue(1);
5605 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5606 return CastBitData(UndefSrcElts, SrcEltBits);
5608 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5609 APInt UndefSrcElts = APInt::getNullValue(1);
5610 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5611 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5612 return CastBitData(UndefSrcElts, SrcEltBits);
5615 // Extract constant bits from build vector.
5616 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5617 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5618 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5620 APInt UndefSrcElts(NumSrcElts, 0);
5621 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5622 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5623 const SDValue &Src = Op.getOperand(i);
5624 if (Src.isUndef()) {
5625 UndefSrcElts.setBit(i);
5628 auto *Cst = cast<ConstantSDNode>(Src);
5629 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5631 return CastBitData(UndefSrcElts, SrcEltBits);
5634 // Extract constant bits from constant pool vector.
5635 if (auto *Cst = getTargetConstantFromNode(Op)) {
5636 Type *CstTy = Cst->getType();
5637 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5640 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5641 unsigned NumSrcElts = CstTy->getVectorNumElements();
5643 APInt UndefSrcElts(NumSrcElts, 0);
5644 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5645 for (unsigned i = 0; i != NumSrcElts; ++i)
5646 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5650 return CastBitData(UndefSrcElts, SrcEltBits);
5653 // Extract constant bits from a broadcasted constant pool scalar.
5654 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5655 EltSizeInBits <= VT.getScalarSizeInBits()) {
5656 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5657 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5658 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5660 APInt UndefSrcElts(NumSrcElts, 0);
5661 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5662 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5663 if (UndefSrcElts[0])
5664 UndefSrcElts.setBits(0, NumSrcElts);
5665 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5666 return CastBitData(UndefSrcElts, SrcEltBits);
5671 // Extract a rematerialized scalar constant insertion.
5672 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5673 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5674 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5675 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5676 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5678 APInt UndefSrcElts(NumSrcElts, 0);
5679 SmallVector<APInt, 64> SrcEltBits;
5680 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5681 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5682 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5683 return CastBitData(UndefSrcElts, SrcEltBits);
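// For example, querying a v2i64 build vector <0x0000000100000002, undef> for
// 32-bit elements yields EltBits = {2, 1, 0, 0} with the upper two elements
// marked undef (assuming whole-element undefs are allowed).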
5689 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5690 unsigned MaskEltSizeInBits,
5691 SmallVectorImpl<uint64_t> &RawMask) {
5693 SmallVector<APInt, 64> EltBits;
5695 // Extract the raw target constant bits.
5696 // FIXME: We currently don't support UNDEF bits or mask entries.
5697 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5698 EltBits, /* AllowWholeUndefs */ false,
5699 /* AllowPartialUndefs */ false))
5702 // Insert the extracted elements into the mask.
5703 for (APInt Elt : EltBits)
5704 RawMask.push_back(Elt.getZExtValue());
5709 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5710 /// Note: This ignores saturation, so inputs must be checked first.
5711 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5713 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5714 unsigned NumElts = VT.getVectorNumElements();
5715 unsigned NumLanes = VT.getSizeInBits() / 128;
5716 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5717 unsigned Offset = Unary ? 0 : NumElts;
5719 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5720 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5721 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5722 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5723 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
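// For example, a binary v16i8 pack (PACKSSWB/PACKUSWB of two v8i16 inputs)
// corresponds to the byte shuffle mask <0,2,...,14,16,18,...,30>: the even
// (low) bytes of each input, first input first.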
5727 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5728 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5729 /// operands in \p Ops, and returns true.
5730 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5731 /// IsUnary for shuffles which use a single input multiple times, and in those
5732 /// cases it will adjust the mask to only have indices within that single input.
5733 /// It is an error to call this with non-empty Mask/Ops vectors.
5734 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5735 SmallVectorImpl<SDValue> &Ops,
5736 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5737 unsigned NumElems = VT.getVectorNumElements();
5740 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5741 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5744 bool IsFakeUnary = false;
5745 switch(N->getOpcode()) {
5746 case X86ISD::BLENDI:
5747 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5748 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5749 ImmN = N->getOperand(N->getNumOperands()-1);
5750 DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5751 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5754 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5755 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5756 ImmN = N->getOperand(N->getNumOperands()-1);
5757 DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
5758 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5759 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5761 case X86ISD::INSERTPS:
5762 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5763 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5764 ImmN = N->getOperand(N->getNumOperands()-1);
5765 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5766 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5768 case X86ISD::EXTRQI:
5769 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5770 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5771 isa<ConstantSDNode>(N->getOperand(2))) {
5772 int BitLen = N->getConstantOperandVal(1);
5773 int BitIdx = N->getConstantOperandVal(2);
5774 DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5779 case X86ISD::INSERTQI:
5780 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5781 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5782 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5783 isa<ConstantSDNode>(N->getOperand(3))) {
5784 int BitLen = N->getConstantOperandVal(2);
5785 int BitIdx = N->getConstantOperandVal(3);
5786 DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5788 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5791 case X86ISD::UNPCKH:
5792 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5793 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5794 DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
5795 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5797 case X86ISD::UNPCKL:
5798 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5799 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5800 DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
5801 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5803 case X86ISD::MOVHLPS:
5804 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5805 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5806 DecodeMOVHLPSMask(NumElems, Mask);
5807 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5809 case X86ISD::MOVLHPS:
5810 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5811 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5812 DecodeMOVLHPSMask(NumElems, Mask);
5813 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5815 case X86ISD::PALIGNR:
5816 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5817 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5818 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5819 ImmN = N->getOperand(N->getNumOperands()-1);
5820 DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5822 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5823 Ops.push_back(N->getOperand(1));
5824 Ops.push_back(N->getOperand(0));
5826 case X86ISD::VSHLDQ:
5827 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5828 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5829 ImmN = N->getOperand(N->getNumOperands() - 1);
5830 DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5834 case X86ISD::VSRLDQ:
5835 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5836 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5837 ImmN = N->getOperand(N->getNumOperands() - 1);
5838 DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5842 case X86ISD::PSHUFD:
5843 case X86ISD::VPERMILPI:
5844 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5845 ImmN = N->getOperand(N->getNumOperands()-1);
5846 DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
5847 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5850 case X86ISD::PSHUFHW:
5851 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5852 ImmN = N->getOperand(N->getNumOperands()-1);
5853 DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5857 case X86ISD::PSHUFLW:
5858 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5859 ImmN = N->getOperand(N->getNumOperands()-1);
5860 DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5864 case X86ISD::VZEXT_MOVL:
5865 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5866 DecodeZeroMoveLowMask(NumElems, Mask);
5869 case X86ISD::VBROADCAST: {
5870 SDValue N0 = N->getOperand(0);
5871 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5872 // add the pre-extracted value to the Ops vector.
5873 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5874 N0.getOperand(0).getValueType() == VT &&
5875 N0.getConstantOperandVal(1) == 0)
5876 Ops.push_back(N0.getOperand(0));
5878 // We only decode broadcasts of same-sized vectors, unless the broadcast
5879 // came from an extract from the original width. If we found one, we
5880 // pushed it onto the Ops vector above.
5881 if (N0.getValueType() == VT || !Ops.empty()) {
5882 DecodeVectorBroadcast(NumElems, Mask);
5888 case X86ISD::VPERMILPV: {
5889 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5891 SDValue MaskNode = N->getOperand(1);
5892 unsigned MaskEltSize = VT.getScalarSizeInBits();
5893 SmallVector<uint64_t, 32> RawMask;
5894 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5895 DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
5898 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5899 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5904 case X86ISD::PSHUFB: {
5905 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5906 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5907 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5909 SDValue MaskNode = N->getOperand(1);
5910 SmallVector<uint64_t, 32> RawMask;
5911 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5912 DecodePSHUFBMask(RawMask, Mask);
5915 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5916 DecodePSHUFBMask(C, Mask);
5921 case X86ISD::VPERMI:
5922 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5923 ImmN = N->getOperand(N->getNumOperands()-1);
5924 DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5929 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5930 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5931 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5933 case X86ISD::VPERM2X128:
5934 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5935 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5936 ImmN = N->getOperand(N->getNumOperands()-1);
5937 DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5939 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5941 case X86ISD::SHUF128:
5942 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5943 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5944 ImmN = N->getOperand(N->getNumOperands()-1);
5945 decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
5946 cast<ConstantSDNode>(ImmN)->getZExtValue(),
5948 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5950 case X86ISD::MOVSLDUP:
5951 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5952 DecodeMOVSLDUPMask(NumElems, Mask);
5955 case X86ISD::MOVSHDUP:
5956 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5957 DecodeMOVSHDUPMask(NumElems, Mask);
5960 case X86ISD::MOVDDUP:
5961 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5962 DecodeMOVDDUPMask(NumElems, Mask);
5965 case X86ISD::VPERMIL2: {
5966 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5967 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5968 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5969 unsigned MaskEltSize = VT.getScalarSizeInBits();
5970 SDValue MaskNode = N->getOperand(2);
5971 SDValue CtrlNode = N->getOperand(3);
5972 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5973 unsigned CtrlImm = CtrlOp->getZExtValue();
5974 SmallVector<uint64_t, 32> RawMask;
5975 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5976 DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
5980 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5981 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5987 case X86ISD::VPPERM: {
5988 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5989 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5990 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5991 SDValue MaskNode = N->getOperand(2);
5992 SmallVector<uint64_t, 32> RawMask;
5993 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5994 DecodeVPPERMMask(RawMask, Mask);
5997 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5998 DecodeVPPERMMask(C, Mask);
6003 case X86ISD::VPERMV: {
6004 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6006 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6007 Ops.push_back(N->getOperand(1));
6008 SDValue MaskNode = N->getOperand(0);
6009 SmallVector<uint64_t, 32> RawMask;
6010 unsigned MaskEltSize = VT.getScalarSizeInBits();
6011 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
6012 DecodeVPERMVMask(RawMask, Mask);
6015 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6016 DecodeVPERMVMask(C, MaskEltSize, Mask);
6021 case X86ISD::VPERMV3: {
6022 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6023 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
6024 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
6025 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6026 Ops.push_back(N->getOperand(0));
6027 Ops.push_back(N->getOperand(2));
6028 SDValue MaskNode = N->getOperand(1);
6029 unsigned MaskEltSize = VT.getScalarSizeInBits();
6030 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6031 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
6036 default: llvm_unreachable("unknown target shuffle node");
6039 // Empty mask indicates the decode failed.
6044 // Check if we're getting a shuffle mask with zeroed elements.
6044 if (!AllowSentinelZero)
6045 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6048 // If we have a fake unary shuffle, the shuffle mask is spread across two
6049 // inputs that are actually the same node. Re-map the mask to always point
6050 // into the first input.
6053 if (M >= (int)Mask.size())
6056 // If we didn't already add operands in the opcode-specific code, default to
6057 // adding 1 or 2 operands starting at 0.
6059 Ops.push_back(N->getOperand(0));
6060 if (!IsUnary || IsFakeUnary)
6061 Ops.push_back(N->getOperand(1));
6067 /// Check a target shuffle mask's inputs to see if we can set any values to
6068 /// SM_SentinelZero - this is for elements that are known to be zero
6069 /// (not just zeroable) from their inputs.
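/// For example (an illustrative sketch): if input 0 is a build_vector whose
/// element 1 is the constant 0, any mask entry referencing that element can
/// be rewritten as SM_SentinelZero instead of a real input index.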
6070 /// Returns true if the target shuffle mask was decoded.
6071 static bool setTargetShuffleZeroElements(SDValue N,
6072 SmallVectorImpl<int> &Mask,
6073 SmallVectorImpl<SDValue> &Ops) {
6075 if (!isTargetShuffle(N.getOpcode()))
6078 MVT VT = N.getSimpleValueType();
6079 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
6082 SDValue V1 = Ops[0];
6083 SDValue V2 = IsUnary ? V1 : Ops[1];
6085 V1 = peekThroughBitcasts(V1);
6086 V2 = peekThroughBitcasts(V2);
6088 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
6089 "Illegal split of shuffle value type");
6090 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
6092 // Extract known constant input data.
6093 APInt UndefSrcElts[2];
6094 SmallVector<APInt, 32> SrcEltBits[2];
6095 bool IsSrcConstant[2] = {
6096 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6097 SrcEltBits[0], true, false),
6098 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6099 SrcEltBits[1], true, false)};
6101 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6104 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6108 // Determine shuffle input and normalize the mask.
6109 unsigned SrcIdx = M / Size;
6110 SDValue V = M < Size ? V1 : V2;
6113 // We are referencing an UNDEF input.
6115 Mask[i] = SM_SentinelUndef;
6119 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6120 // TODO: We currently only set UNDEF for integer types - floats use the same
6121 // registers as vectors and many of the scalar folded loads rely on the
6122 // SCALAR_TO_VECTOR pattern.
6123 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6124 (Size % V.getValueType().getVectorNumElements()) == 0) {
6125 int Scale = Size / V.getValueType().getVectorNumElements();
6126 int Idx = M / Scale;
6127 if (Idx != 0 && !VT.isFloatingPoint())
6128 Mask[i] = SM_SentinelUndef;
6129 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6130 Mask[i] = SM_SentinelZero;
6134 // Attempt to extract from the source's constant bits.
6135 if (IsSrcConstant[SrcIdx]) {
6136 if (UndefSrcElts[SrcIdx][M])
6137 Mask[i] = SM_SentinelUndef;
6138 else if (SrcEltBits[SrcIdx][M] == 0)
6139 Mask[i] = SM_SentinelZero;
6143 assert(VT.getVectorNumElements() == Mask.size() &&
6144 "Different mask size from vector size!");
6148 // Attempt to decode ops that could be represented as a shuffle mask.
6149 // The decoded shuffle mask may contain a different number of elements from the
6150 // destination value type.
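// For example (a sketch of the VSRLI handling below): a v2i64 logical right
// shift by 32 bits can be modelled as the v16i8 byte shuffle
// <4,5,6,7,zero,zero,zero,zero,12,13,14,15,zero,zero,zero,zero>.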
6151 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
6152 SmallVectorImpl<SDValue> &Ops,
6153 const SelectionDAG &DAG) {
6157 MVT VT = N.getSimpleValueType();
6158 unsigned NumElts = VT.getVectorNumElements();
6159 unsigned NumSizeInBits = VT.getSizeInBits();
6160 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6161 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
6162 "Expected byte aligned value types");
6164 unsigned Opcode = N.getOpcode();
6166 case ISD::VECTOR_SHUFFLE: {
6167 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6168 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6169 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6170 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6171 Ops.push_back(N.getOperand(0));
6172 Ops.push_back(N.getOperand(1));
6178 case X86ISD::ANDNP: {
6179 // Attempt to decode as a per-byte mask.
6181 SmallVector<APInt, 32> EltBits;
6182 SDValue N0 = N.getOperand(0);
6183 SDValue N1 = N.getOperand(1);
6184 bool IsAndN = (X86ISD::ANDNP == Opcode);
6185 uint64_t ZeroMask = IsAndN ? 255 : 0;
6186 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
6188 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6190 Mask.push_back(SM_SentinelUndef);
6193 uint64_t ByteBits = EltBits[i].getZExtValue();
6194 if (ByteBits != 0 && ByteBits != 255)
6196 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6198 Ops.push_back(IsAndN ? N1 : N0);
6201 case ISD::SCALAR_TO_VECTOR: {
6202 // Match against a scalar_to_vector of an extract from a vector;
6203 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
6204 SDValue N0 = N.getOperand(0);
6207 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6208 N0.getOperand(0).getValueType() == VT) ||
6209 (N0.getOpcode() == X86ISD::PEXTRW &&
6210 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6211 (N0.getOpcode() == X86ISD::PEXTRB &&
6212 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6216 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6219 SDValue SrcVec = SrcExtract.getOperand(0);
6220 EVT SrcVT = SrcVec.getValueType();
6221 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6222 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6224 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6225 if (NumSrcElts <= SrcIdx)
6228 Ops.push_back(SrcVec);
6229 Mask.push_back(SrcIdx);
6230 Mask.append(NumZeros, SM_SentinelZero);
6231 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6234 case X86ISD::PINSRB:
6235 case X86ISD::PINSRW: {
6236 SDValue InVec = N.getOperand(0);
6237 SDValue InScl = N.getOperand(1);
6238 SDValue InIndex = N.getOperand(2);
6239 if (!isa<ConstantSDNode>(InIndex) ||
6240 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
6242 uint64_t InIdx = N.getConstantOperandVal(2);
6244 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6245 if (X86::isZeroNode(InScl)) {
6246 Ops.push_back(InVec);
6247 for (unsigned i = 0; i != NumElts; ++i)
6248 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6252 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6253 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6255 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6256 if (InScl.getOpcode() != ExOp)
6259 SDValue ExVec = InScl.getOperand(0);
6260 SDValue ExIndex = InScl.getOperand(1);
6261 if (!isa<ConstantSDNode>(ExIndex) ||
6262 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
6264 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6266 Ops.push_back(InVec);
6267 Ops.push_back(ExVec);
6268 for (unsigned i = 0; i != NumElts; ++i)
6269 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6272 case X86ISD::PACKSS:
6273 case X86ISD::PACKUS: {
6274 SDValue N0 = N.getOperand(0);
6275 SDValue N1 = N.getOperand(1);
6276 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6277 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6278 "Unexpected input value type");
6280 // If we know input saturation won't happen we can treat this
6281 // as a truncation shuffle.
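// For example (a sketch): PACKSSWB of two non-saturating v8i16 inputs acts
// as the v16i8 shuffle <0,2,4,...,14,16,18,...,30> of the concatenated
// inputs, i.e. it keeps the low byte of every 16-bit element.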
6282 if (Opcode == X86ISD::PACKSS) {
6283 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6284 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6287 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6288 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6289 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6293 bool IsUnary = (N0 == N1);
6299 createPackShuffleMask(VT, Mask, IsUnary);
6303 case X86ISD::VSRLI: {
6304 uint64_t ShiftVal = N.getConstantOperandVal(1);
6305 // Out of range bit shifts are guaranteed to be zero.
6306 if (NumBitsPerElt <= ShiftVal) {
6307 Mask.append(NumElts, SM_SentinelZero);
6311 // We can only decode 'whole byte' bit shifts as shuffles.
6312 if ((ShiftVal % 8) != 0)
6315 uint64_t ByteShift = ShiftVal / 8;
6316 unsigned NumBytes = NumSizeInBits / 8;
6317 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6318 Ops.push_back(N.getOperand(0));
6320 // Clear mask to all zeros and insert the shifted byte indices.
6321 Mask.append(NumBytes, SM_SentinelZero);
6323 if (X86ISD::VSHLI == Opcode) {
6324 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6325 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6326 Mask[i + j] = i + j - ByteShift;
6327 } else {
6328 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6329 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6330 Mask[i + j - ByteShift] = i + j;
6331 }
6334 case ISD::ZERO_EXTEND_VECTOR_INREG:
6335 case X86ISD::VZEXT: {
6336 // TODO - add support for VPMOVZX with smaller input vector types.
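// For example (a sketch): a zero extension from v8i16 to v4i32 decodes as
// the v8i16 shuffle <0, zero, 1, zero, 2, zero, 3, zero>.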
6337 SDValue Src = N.getOperand(0);
6338 MVT SrcVT = Src.getSimpleValueType();
6339 if (NumSizeInBits != SrcVT.getSizeInBits())
6341 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
6342 VT.getVectorNumElements(), Mask);
6351 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
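/// For example (a sketch): with Inputs = {A, B} and a mask that never
/// references B's index range, B is dropped and the mask is left unchanged;
/// if only B were used, A would be dropped and B's indices remapped down
/// into the first input's range.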
6352 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6353 SmallVectorImpl<int> &Mask) {
6354 int MaskWidth = Mask.size();
6355 SmallVector<SDValue, 16> UsedInputs;
6356 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6357 int lo = UsedInputs.size() * MaskWidth;
6358 int hi = lo + MaskWidth;
6360 // Strip UNDEF input usage.
6361 if (Inputs[i].isUndef())
6363 if ((lo <= M) && (M < hi))
6364 M = SM_SentinelUndef;
6366 // Check for unused inputs.
6367 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6368 UsedInputs.push_back(Inputs[i]);
6375 Inputs = UsedInputs;
6378 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6379 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
6380 /// remaining input indices in case we now have a unary shuffle, and adjusts the
6381 /// inputs accordingly.
6382 /// Returns true if the target shuffle mask was decoded.
6383 static bool resolveTargetShuffleInputs(SDValue Op,
6384 SmallVectorImpl<SDValue> &Inputs,
6385 SmallVectorImpl<int> &Mask,
6386 const SelectionDAG &DAG) {
6387 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6388 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6391 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6395 /// Returns the scalar element that will make up the ith
6396 /// element of the result of the vector shuffle.
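/// For example (a sketch): asking for element 0 of
/// (vector_shuffle<2,3,0,1> A, B) recurses into element 2 of A.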
6397 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6400 return SDValue(); // Limit search depth.
6402 SDValue V = SDValue(N, 0);
6403 EVT VT = V.getValueType();
6404 unsigned Opcode = V.getOpcode();
6406 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6407 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6408 int Elt = SV->getMaskElt(Index);
6411 return DAG.getUNDEF(VT.getVectorElementType());
6413 unsigned NumElems = VT.getVectorNumElements();
6414 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6415 : SV->getOperand(1);
6416 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6419 // Recurse into target specific vector shuffles to find scalars.
6420 if (isTargetShuffle(Opcode)) {
6421 MVT ShufVT = V.getSimpleValueType();
6422 MVT ShufSVT = ShufVT.getVectorElementType();
6423 int NumElems = (int)ShufVT.getVectorNumElements();
6424 SmallVector<int, 16> ShuffleMask;
6425 SmallVector<SDValue, 16> ShuffleOps;
6428 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6431 int Elt = ShuffleMask[Index];
6432 if (Elt == SM_SentinelZero)
6433 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6434 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6435 if (Elt == SM_SentinelUndef)
6436 return DAG.getUNDEF(ShufSVT);
6438 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6439 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6440 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6444 // Actual nodes that may contain scalar elements
6445 if (Opcode == ISD::BITCAST) {
6446 V = V.getOperand(0);
6447 EVT SrcVT = V.getValueType();
6448 unsigned NumElems = VT.getVectorNumElements();
6450 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6454 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6455 return (Index == 0) ? V.getOperand(0)
6456 : DAG.getUNDEF(VT.getVectorElementType());
6458 if (V.getOpcode() == ISD::BUILD_VECTOR)
6459 return V.getOperand(Index);
6464 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6465 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6466 unsigned NumNonZero, unsigned NumZero,
6468 const X86Subtarget &Subtarget) {
6469 MVT VT = Op.getSimpleValueType();
6470 unsigned NumElts = VT.getVectorNumElements();
6471 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6472 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6473 "Illegal vector insertion");
6479 for (unsigned i = 0; i < NumElts; ++i) {
6480 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6484 // If the build vector contains zeros or our first insertion is not at the
6485 // first index, then insert into a zero vector to break any register
6486 // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6489 if (NumZero || 0 != i)
6490 V = getZeroVector(VT, Subtarget, DAG, dl);
6492 assert(0 == i && "Expected insertion into zero-index");
6493 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6494 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6495 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6496 V = DAG.getBitcast(VT, V);
6500 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6501 DAG.getIntPtrConstant(i, dl));
6507 /// Custom lower build_vector of v16i8.
6508 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6509 unsigned NumNonZero, unsigned NumZero,
6511 const X86Subtarget &Subtarget) {
6512 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6515 // SSE4.1 - use PINSRB to insert each byte directly.
6516 if (Subtarget.hasSSE41())
6517 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6524 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
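// For example (a sketch): non-zero bytes b0,b1 are combined into the 16-bit
// value (zext(b1) << 8) | zext(b0) and inserted as word 0 of a v8i16.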
6525 for (unsigned i = 0; i < 16; ++i) {
6526 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6527 if (ThisIsNonZero && First) {
6529 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6531 V = DAG.getUNDEF(MVT::v8i16);
6536 // FIXME: Investigate extending to i32 instead of just i16.
6537 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6538 SDValue ThisElt, LastElt;
6539 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6540 if (LastIsNonZero) {
6542 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6544 if (ThisIsNonZero) {
6545 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6546 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6547 DAG.getConstant(8, dl, MVT::i8));
6549 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6555 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6556 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6557 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6558 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6559 V = DAG.getBitcast(MVT::v8i16, V);
6561 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6562 DAG.getIntPtrConstant(i / 2, dl));
6568 return DAG.getBitcast(MVT::v16i8, V);
6571 /// Custom lower build_vector of v8i16.
6572 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6573 unsigned NumNonZero, unsigned NumZero,
6575 const X86Subtarget &Subtarget) {
6576 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6579 // Use PINSRW to insert each element directly.
6580 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6584 /// Custom lower build_vector of v4i32 or v4f32.
6585 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6586 const X86Subtarget &Subtarget) {
6587 // Find all zeroable elements.
6588 std::bitset<4> Zeroable;
6589 for (int i=0; i < 4; ++i) {
6590 SDValue Elt = Op->getOperand(i);
6591 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6593 assert(Zeroable.size() - Zeroable.count() > 1 &&
6594 "We expect at least two non-zero elements!");
6596 // We only know how to deal with build_vector nodes where elements are either
6597 // zeroable or extract_vector_elt with constant index.
6598 SDValue FirstNonZero;
6599 unsigned FirstNonZeroIdx;
6600 for (unsigned i=0; i < 4; ++i) {
6603 SDValue Elt = Op->getOperand(i);
6604 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6605 !isa<ConstantSDNode>(Elt.getOperand(1)))
6607 // Make sure that this node is extracting from a 128-bit vector.
6608 MVT VT = Elt.getOperand(0).getSimpleValueType();
6609 if (!VT.is128BitVector())
6611 if (!FirstNonZero.getNode()) {
6613 FirstNonZeroIdx = i;
6617 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6618 SDValue V1 = FirstNonZero.getOperand(0);
6619 MVT VT = V1.getSimpleValueType();
6621 // See if this build_vector can be lowered as a blend with zero.
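// For example (a sketch): <x, 0, 0, y>, with x and y extracted from V1 at
// the matching indices 0 and 3, becomes shuffle<0,5,6,3> of (V1, zero).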
6623 unsigned EltMaskIdx, EltIdx;
6625 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6626 if (Zeroable[EltIdx]) {
6627 // The zero vector will be on the right hand side.
6628 Mask[EltIdx] = EltIdx+4;
6632 Elt = Op->getOperand(EltIdx);
6633 // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
6634 EltMaskIdx = Elt.getConstantOperandVal(1);
6635 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6637 Mask[EltIdx] = EltIdx;
6641 // Let the shuffle legalizer deal with blend operations.
6642 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6643 if (V1.getSimpleValueType() != VT)
6644 V1 = DAG.getBitcast(VT, V1);
6645 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6648 // See if we can lower this build_vector to an INSERTPS.
6649 if (!Subtarget.hasSSE41())
6652 SDValue V2 = Elt.getOperand(0);
6653 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6656 bool CanFold = true;
6657 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6661 SDValue Current = Op->getOperand(i);
6662 SDValue SrcVector = Current->getOperand(0);
6665 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6671 assert(V1.getNode() && "Expected at least two non-zero elements!");
6672 if (V1.getSimpleValueType() != MVT::v4f32)
6673 V1 = DAG.getBitcast(MVT::v4f32, V1);
6674 if (V2.getSimpleValueType() != MVT::v4f32)
6675 V2 = DAG.getBitcast(MVT::v4f32, V2);
6677 // Ok, we can emit an INSERTPS instruction.
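// A sketch of the INSERTPS immediate layout: bits [7:6] select the source
// element of V2, bits [5:4] select the destination lane in V1, and
// bits [3:0] form the zero mask applied to the result lanes.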
6678 unsigned ZMask = Zeroable.to_ulong();
6680 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6681 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6683 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6684 DAG.getIntPtrConstant(InsertPSMask, DL));
6685 return DAG.getBitcast(VT, Result);
6688 /// Return a vector logical shift node.
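/// For example (a sketch): a left shift of a v4i32 by 32 bits is emitted as
/// VSHLDQ (i.e. PSLLDQ) of the underlying v16i8 value by 4 bytes.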
6689 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6690 SelectionDAG &DAG, const TargetLowering &TLI,
6692 assert(VT.is128BitVector() && "Unknown type for VShift");
6693 MVT ShVT = MVT::v16i8;
6694 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6695 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6696 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6697 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
6698 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6701 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6702 SelectionDAG &DAG) {
6704 // Check if the scalar load can be widened into a vector load. And if
6705 // the address is "base + cst", see if the cst can be "absorbed" into
6706 // the shuffle mask.
6707 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6708 SDValue Ptr = LD->getBasePtr();
6709 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6711 EVT PVT = LD->getValueType(0);
6712 if (PVT != MVT::i32 && PVT != MVT::f32)
6717 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6718 FI = FINode->getIndex();
6720 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6721 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6722 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6723 Offset = Ptr.getConstantOperandVal(1);
6724 Ptr = Ptr.getOperand(0);
6729 // FIXME: 256-bit vector instructions don't require strict alignment;
6730 // improve this code to support them better.
6731 unsigned RequiredAlign = VT.getSizeInBits()/8;
6732 SDValue Chain = LD->getChain();
6733 // Make sure the stack object alignment is at least 16 or 32.
6734 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6735 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6736 if (MFI.isFixedObjectIndex(FI)) {
6737 // Can't change the alignment. FIXME: It's possible to compute
6738 // the exact stack offset and reference FI + adjusted offset instead;
6739 // if someone *really* cares about this, that's the way to implement it.
6742 MFI.setObjectAlignment(FI, RequiredAlign);
6746 // (Offset % RequiredAlign) must be a multiple of 4. The address is then
6747 // Ptr + (Offset & ~(RequiredAlign - 1)).
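// For example (a sketch): for a 16-byte vector type with Offset == 20,
// StartOffset is 16, the widened load reads from Ptr + 16, and element
// (20 - 16) / 4 == 1 of the loaded vector is splatted.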
6750 if ((Offset % RequiredAlign) & 3)
6752 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6755 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6756 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6759 int EltNo = (Offset - StartOffset) >> 2;
6760 unsigned NumElems = VT.getVectorNumElements();
6762 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6763 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6764 LD->getPointerInfo().getWithOffset(StartOffset));
6766 SmallVector<int, 8> Mask(NumElems, EltNo);
6768 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6774 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6775 /// elements can be replaced by a single large load which has the same value as
6776 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6778 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6779 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6780 const SDLoc &DL, SelectionDAG &DAG,
6781 const X86Subtarget &Subtarget,
6782 bool isAfterLegalize) {
6783 unsigned NumElems = Elts.size();
6785 int LastLoadedElt = -1;
6786 SmallBitVector LoadMask(NumElems, false);
6787 SmallBitVector ZeroMask(NumElems, false);
6788 SmallBitVector UndefMask(NumElems, false);
6790 // For each element in the initializer, see if we've found a load, zero or an
6791 // undef.
6792 for (unsigned i = 0; i < NumElems; ++i) {
6793 SDValue Elt = peekThroughBitcasts(Elts[i]);
6798 UndefMask[i] = true;
6799 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6801 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6804 // Each loaded element must be the correct fractional portion of the
6805 // requested vector load.
6806 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6811 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6812 "Incomplete element masks");
6814 // Handle Special Cases - all undef or undef/zero.
6815 if (UndefMask.count() == NumElems)
6816 return DAG.getUNDEF(VT);
6818 // FIXME: Should we return this as a BUILD_VECTOR instead?
6819 if ((ZeroMask | UndefMask).count() == NumElems)
6820 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6821 : DAG.getConstantFP(0.0, DL, VT);
6823 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6824 int FirstLoadedElt = LoadMask.find_first();
6825 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6826 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6827 EVT LDBaseVT = EltBase.getValueType();
6829 // Consecutive loads can contain UNDEFs but not ZERO elements.
6830 // Consecutive loads with UNDEF and ZERO elements require an
6831 // additional shuffle stage to clear the ZERO elements.
6832 bool IsConsecutiveLoad = true;
6833 bool IsConsecutiveLoadWithZeros = true;
6834 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6836 SDValue Elt = peekThroughBitcasts(Elts[i]);
6837 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6838 if (!DAG.areNonVolatileConsecutiveLoads(
6839 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6840 i - FirstLoadedElt)) {
6841 IsConsecutiveLoad = false;
6842 IsConsecutiveLoadWithZeros = false;
6845 } else if (ZeroMask[i]) {
6846 IsConsecutiveLoad = false;
6850 SmallVector<LoadSDNode *, 8> Loads;
6851 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6853 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6855 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6856 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6857 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6858 "Cannot merge volatile loads.");
6860 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6861 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6862 for (auto *LD : Loads)
6863 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6867 // LOAD - all consecutive load/undefs (must start/end with a load).
6868 // If we have found an entire vector of loads and undefs, then return a large
6869 // load of the entire vector width starting at the base pointer.
6870 // If the vector contains zeros, then attempt to shuffle those elements.
6871 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6872 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6873 assert(LDBase && "Did not find base load for merging consecutive loads");
6874 EVT EltVT = LDBase->getValueType(0);
6875 // Ensure that the input vector size for the merged loads matches the
6876 // cumulative size of the input elements.
6877 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6880 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6883 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6884 // will lower to regular temporal loads and use the cache.
6885 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6886 VT.is256BitVector() && !Subtarget.hasInt256())
6889 if (IsConsecutiveLoad)
6890 return CreateLoad(VT, LDBase);
6892 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6893 // vector and a zero vector to clear out the zero elements.
6894 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6895 SmallVector<int, 4> ClearMask(NumElems, -1);
6896 for (unsigned i = 0; i < NumElems; ++i) {
6898 ClearMask[i] = i + NumElems;
6899 else if (LoadMask[i])
6902 SDValue V = CreateLoad(VT, LDBase);
6903 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6904 : DAG.getConstantFP(0.0, DL, VT);
6905 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6910 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6912 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
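// For example (a sketch): <load i32 *a, zero, undef, zero> can become a
// 32-bit VZEXT_LOAD of *a into v4i32, with the upper elements zeroed.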
6913 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6914 (LoadSize == 32 || LoadSize == 64) &&
6915 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6916 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6917 : MVT::getIntegerVT(LoadSize);
6918 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6919 if (TLI.isTypeLegal(VecVT)) {
6920 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6921 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6923 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6924 LDBase->getPointerInfo(),
6925 LDBase->getAlignment(),
6926 MachineMemOperand::MOLoad);
6927 for (auto *LD : Loads)
6928 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6929 return DAG.getBitcast(VT, ResNode);
6936 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6937 unsigned SplatBitSize, LLVMContext &C) {
6938 unsigned ScalarSize = VT.getScalarSizeInBits();
6939 unsigned NumElm = SplatBitSize / ScalarSize;
6941 SmallVector<Constant *, 32> ConstantVec;
6942 for (unsigned i = 0; i < NumElm; i++) {
6943 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6945 if (VT.isFloatingPoint()) {
6946 if (ScalarSize == 32) {
6947 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6949 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6950 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6953 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6954 ConstantVec.push_back(Const);
6956 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6959 static bool isUseOfShuffle(SDNode *N) {
6960 for (auto *U : N->uses()) {
6961 if (isTargetShuffle(U->getOpcode()))
6963 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6964 return isUseOfShuffle(U);
6969 // Check if the current node of a build vector is a zero-extended vector.
6970 // If so, return the value extended.
6971 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6972 // NumElt - return the number of zero-extended identical values.
6973 // EltType - return the type of the value including the zero extension.
6974 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6975 unsigned &NumElt, MVT &EltType) {
6976 SDValue ExtValue = Op->getOperand(0);
6977 unsigned NumElts = Op->getNumOperands();
6978 unsigned Delta = NumElts;
6980 for (unsigned i = 1; i < NumElts; i++) {
6981 if (Op->getOperand(i) == ExtValue) {
6985 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6988 if (!isPowerOf2_32(Delta) || Delta == 1)
6991 for (unsigned i = Delta; i < NumElts; i++) {
6992 if (i % Delta == 0) {
6993 if (Op->getOperand(i) != ExtValue)
6995 } else if (!(isNullConstant(Op->getOperand(i)) ||
6996 Op->getOperand(i).isUndef()))
6999 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
7000 unsigned ExtVTSize = EltSize * Delta;
7001 EltType = MVT::getIntegerVT(ExtVTSize);
7002 NumElt = NumElts / Delta;
7006 /// Attempt to use the vbroadcast instruction to generate a splat value
7007 /// from a splat BUILD_VECTOR which uses:
7008 /// a. A single scalar load, or a constant.
7009 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7011 /// The VBROADCAST node is returned when a pattern is found,
7012 /// or SDValue() otherwise.
7013 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7014 const X86Subtarget &Subtarget,
7015 SelectionDAG &DAG) {
7016 // VBROADCAST requires AVX.
7017 // TODO: Splats could be generated for non-AVX CPUs using SSE
7018 // instructions, but there's less potential gain for only 128-bit vectors.
7019 if (!Subtarget.hasAVX())
7022 MVT VT = BVOp->getSimpleValueType(0);
7025 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7026 "Unsupported vector type for broadcast.");
7028 BitVector UndefElements;
7029 SDValue Ld = BVOp->getSplatValue(&UndefElements);
7031 // Attempt to use VBROADCASTM
7032 // From this pattern:
7033 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7034 // b. t1 = (build_vector t0 t0)
7036 // Create (VBROADCASTM v2i1 X)
7037 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
7038 MVT EltType = VT.getScalarType();
7039 unsigned NumElts = VT.getVectorNumElements();
7041 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
7042 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
7043 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
7044 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
7046 BOperand = ZeroExtended.getOperand(0);
7048 BOperand = Ld.getOperand(0).getOperand(0);
7049 MVT MaskVT = BOperand.getSimpleValueType();
7050 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7051 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7053 DAG.getNode(X86ISD::VBROADCASTM, dl,
7054 MVT::getVectorVT(EltType, NumElts), BOperand);
7055 return DAG.getBitcast(VT, Brdcst);
7060 // We need a splat of a single value to use broadcast, and it doesn't
7061 // make any sense if the value is only in one element of the vector.
7062 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
7063 APInt SplatValue, Undef;
7064 unsigned SplatBitSize;
7066 // Check if this is a repeated constant pattern suitable for broadcasting.
7067 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7068 SplatBitSize > VT.getScalarSizeInBits() &&
7069 SplatBitSize < VT.getSizeInBits()) {
7070 // Avoid replacing with broadcast when it's a use of a shuffle
7071 // instruction to preserve the present custom lowering of shuffles.
7072 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
7074 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
7075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7076 LLVMContext *Ctx = DAG.getContext();
7077 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7078 if (Subtarget.hasAVX()) {
7079 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
7080 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
7081 // Splatted value can fit in one INTEGER constant in constant pool.
7082 // Load the constant and broadcast it.
7083 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7084 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
7085 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
7086 SDValue CP = DAG.getConstantPool(C, PVT);
7087 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7089 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7091 CVT, dl, DAG.getEntryNode(), CP,
7092 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7094 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7095 MVT::getVectorVT(CVT, Repeat), Ld);
7096 return DAG.getBitcast(VT, Brdcst);
7097 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
7098 // Splatted value can fit in one FLOAT constant in constant pool.
7099 // Load the constant and broadcast it.
7100 // AVX has support for 32-bit and 64-bit broadcasts of floats only;
7101 // there is no 64-bit integer broadcast on a 32-bit subtarget.
7102 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
7103 // Lower the splat via APFloat directly, to avoid any conversion.
7106 ? ConstantFP::get(*Ctx,
7107 APFloat(APFloat::IEEEsingle(), SplatValue))
7108 : ConstantFP::get(*Ctx,
7109 APFloat(APFloat::IEEEdouble(), SplatValue));
7110 SDValue CP = DAG.getConstantPool(C, PVT);
7111 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7113 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7115 CVT, dl, DAG.getEntryNode(), CP,
7116 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7118 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7119 MVT::getVectorVT(CVT, Repeat), Ld);
7120 return DAG.getBitcast(VT, Brdcst);
7121 } else if (SplatBitSize > 64) {
7122 // Load the vector of constants and broadcast it.
7123 MVT CVT = VT.getScalarType();
7124 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
7126 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7127 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7128 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
7130 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
7131 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7133 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
7134 return DAG.getBitcast(VT, Brdcst);
7141 bool ConstSplatVal =
7142 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7144 // Make sure that all of the users of a non-constant load are from the
7145 // BUILD_VECTOR node.
7146 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
7149 unsigned ScalarSize = Ld.getValueSizeInBits();
7150 bool IsGE256 = (VT.getSizeInBits() >= 256);
7152 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7153 // instruction to save 8 or more bytes of constant pool data.
7154 // TODO: If multiple splats are generated to load the same constant,
7155 // it may be detrimental to overall size. There needs to be a way to detect
7156 // that condition to know if this is truly a size win.
7157 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
7159 // Handle broadcasting a single constant scalar from the constant pool
7160 // into a vector.
7161 // On Sandybridge (no AVX2), it is still better to load a constant vector
7162 // from the constant pool and not to broadcast it from a scalar.
7163 // But override that restriction when optimizing for size.
7164 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7165 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7166 EVT CVT = Ld.getValueType();
7167 assert(!CVT.isVector() && "Must not broadcast a vector type");
7169 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
7170 // For size optimization, also splat v2f64 and v2i64, and for size opt
7171 // with AVX2, also splat i8 and i16.
7172 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7173 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7174 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7175 const Constant *C = nullptr;
7176 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7177 C = CI->getConstantIntValue();
7178 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7179 C = CF->getConstantFPValue();
7181 assert(C && "Invalid constant type");
7183 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7185 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7186 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7188 CVT, dl, DAG.getEntryNode(), CP,
7189 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7192 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7196 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7198 // Handle AVX2 in-register broadcasts.
7199 if (!IsLoad && Subtarget.hasInt256() &&
7200 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7201 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7203 // The scalar source must be a normal load.
7207 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7208 (Subtarget.hasVLX() && ScalarSize == 64))
7209 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7211 // The integer check is needed for the 64-bit scalar into 128-bit vector case,
7212 // so it doesn't match double, since there is no vbroadcastsd xmm instruction.
7213 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7214 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7215 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7218 // Unsupported broadcast.
7222 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
7223 /// underlying vector and index.
7225 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7226 /// index.
7227 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7229 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7230 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7233 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7234 // lowered this:
7235 // (extract_vector_elt (v8f32 %1), Constant<6>)
7236 // to:
7237 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7238 // (extract_subvector (v8f32 %0), Constant<4>),
7241 // In this case the vector is the extract_subvector expression and the index
7242 // is 2, as specified by the shuffle.
7243 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7244 SDValue ShuffleVec = SVOp->getOperand(0);
7245 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7246 assert(ShuffleVecVT.getVectorElementType() ==
7247 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7249 int ShuffleIdx = SVOp->getMaskElt(Idx);
7250 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7251 ExtractedFromVec = ShuffleVec;
7257 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7258 MVT VT = Op.getSimpleValueType();
7260 // Skip if insert_vec_elt is not supported.
7261 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7262 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7266 unsigned NumElems = Op.getNumOperands();
7270 SmallVector<unsigned, 4> InsertIndices;
7271 SmallVector<int, 8> Mask(NumElems, -1);
7273 for (unsigned i = 0; i != NumElems; ++i) {
7274 unsigned Opc = Op.getOperand(i).getOpcode();
7276 if (Opc == ISD::UNDEF)
7279 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7280 // Quit if more than 1 element needs inserting.
7281 if (InsertIndices.size() > 1)
7284 InsertIndices.push_back(i);
7288 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7289 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7291 // Quit if non-constant index.
7292 if (!isa<ConstantSDNode>(ExtIdx))
7294 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7296 // Quit if extracted from vector of different type.
7297 if (ExtractedFromVec.getValueType() != VT)
7300 if (!VecIn1.getNode())
7301 VecIn1 = ExtractedFromVec;
7302 else if (VecIn1 != ExtractedFromVec) {
7303 if (!VecIn2.getNode())
7304 VecIn2 = ExtractedFromVec;
7305 else if (VecIn2 != ExtractedFromVec)
7306 // Quit if more than 2 vectors to shuffle
7310 if (ExtractedFromVec == VecIn1)
7312 else if (ExtractedFromVec == VecIn2)
7313 Mask[i] = Idx + NumElems;
7316 if (!VecIn1.getNode())
7319 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7320 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7322 for (unsigned Idx : InsertIndices)
7323 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7324 DAG.getIntPtrConstant(Idx, DL));
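/// Pack a constant vXi1 build_vector into an integer immediate. For example
/// (a sketch): <i1 1, i1 0, i1 1, i1 1> becomes the 8-bit constant 0b1101,
/// with element i landing in bit i.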
7329 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7330 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7331 Op.getScalarValueSizeInBits() == 1 &&
7332 "Can not convert non-constant vector");
7333 uint64_t Immediate = 0;
7334 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7335 SDValue In = Op.getOperand(idx);
7337 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7340 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7341 return DAG.getConstant(Immediate, dl, VT);
7343 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7344 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7345 const X86Subtarget &Subtarget) {
7347 MVT VT = Op.getSimpleValueType();
7348 assert((VT.getVectorElementType() == MVT::i1) &&
7349 "Unexpected type in LowerBUILD_VECTORvXi1!");
7352 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7355 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7358 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7359 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7360 // Split the pieces.
7362 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7364 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7365 // We have to manually lower both halves so getNode doesn't try to
7366 // reassemble the build_vector.
7367 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7368 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7369 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7371 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7372 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7373 return DAG.getBitcast(VT, Imm);
7374 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7375 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7376 DAG.getIntPtrConstant(0, dl));
7379 // Vector has one or more non-const elements
7380 uint64_t Immediate = 0;
7381 SmallVector<unsigned, 16> NonConstIdx;
7382 bool IsSplat = true;
7383 bool HasConstElts = false;
7385 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7386 SDValue In = Op.getOperand(idx);
7389 if (!isa<ConstantSDNode>(In))
7390 NonConstIdx.push_back(idx);
7392 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7393 HasConstElts = true;
7397 else if (In != Op.getOperand(SplatIdx))
7401 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7403 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7404 DAG.getConstant(1, dl, VT),
7405 DAG.getConstant(0, dl, VT));
7407 // Insert the non-constant elements one by one.
7411 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7412 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7414 else if (HasConstElts)
7415 Imm = DAG.getConstant(0, dl, VT);
7417 Imm = DAG.getUNDEF(VT);
7418 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7419 DstVec = DAG.getBitcast(VT, Imm);
7421 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7422 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7423 DAG.getIntPtrConstant(0, dl));
7426 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7427 unsigned InsertIdx = NonConstIdx[i];
7428 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7429 Op.getOperand(InsertIdx),
7430 DAG.getIntPtrConstant(InsertIdx, dl));
7435 /// Return true if \p N implements a horizontal binop and return the
7436 /// operands for the horizontal binop into V0 and V1.
7438 /// This is a helper function of LowerToHorizontalOp().
7439 /// This function checks whether the input build_vector \p N implements a
7440 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7441 /// operation to match.
7442 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7443 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7444 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7445 /// arithmetic sub.
7447 /// This function only analyzes elements of \p N whose indices are
7448 /// in range [BaseIdx, LastIdx).
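/// For example (a sketch), the v4f32 build_vector
///   ((fadd A0, A1), (fadd A2, A3), (fadd B0, B1), (fadd B2, B3))
/// matches a horizontal FADD with V0 = A and V1 = B, where Ai/Bi denote
/// extract_vector_elt of A/B at index i.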
7449 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7451 unsigned BaseIdx, unsigned LastIdx,
7452 SDValue &V0, SDValue &V1) {
7453 EVT VT = N->getValueType(0);
7455 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7456 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7457 "Invalid Vector in input!");
7459 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7460 bool CanFold = true;
7461 unsigned ExpectedVExtractIdx = BaseIdx;
7462 unsigned NumElts = LastIdx - BaseIdx;
7463 V0 = DAG.getUNDEF(VT);
7464 V1 = DAG.getUNDEF(VT);
7466 // Check if N implements a horizontal binop.
7467 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7468 SDValue Op = N->getOperand(i + BaseIdx);
7471 if (Op->isUndef()) {
7472 // Update the expected vector extract index.
7473 if (i * 2 == NumElts)
7474 ExpectedVExtractIdx = BaseIdx;
7475 ExpectedVExtractIdx += 2;
7479 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7484 SDValue Op0 = Op.getOperand(0);
7485 SDValue Op1 = Op.getOperand(1);
7487 // Try to match the following pattern:
7488 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7489 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7490 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7491 Op0.getOperand(0) == Op1.getOperand(0) &&
7492 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7493 isa<ConstantSDNode>(Op1.getOperand(1)));
7497 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7498 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7500 if (i * 2 < NumElts) {
7502 V0 = Op0.getOperand(0);
7503 if (V0.getValueType() != VT)
7508 V1 = Op0.getOperand(0);
7509 if (V1.getValueType() != VT)
7512 if (i * 2 == NumElts)
7513 ExpectedVExtractIdx = BaseIdx;
7516 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7517 if (I0 == ExpectedVExtractIdx)
7518 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7519 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7520 // Try to match the following dag sequence:
7521 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7522 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7526 ExpectedVExtractIdx += 2;
7532 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7533 /// a concat_vector.
7535 /// This is a helper function of LowerToHorizontalOp().
7536 /// This function expects two 256-bit vectors called V0 and V1.
7537 /// At first, each vector is split into two separate 128-bit vectors.
7538 /// Then, the resulting 128-bit vectors are used to implement two
7539 /// horizontal binary operations.
7541 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7543 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7544 /// the two new horizontal binops.
7545 /// When Mode is set, the first horizontal binop dag node would take as input
7546 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7547 /// horizontal binop dag node would take as input the lower 128-bit of V1
7548 /// and the upper 128-bit of V1.
7550 /// HADD V0_LO, V0_HI
7551 /// HADD V1_LO, V1_HI
7553 /// Otherwise, the first horizontal binop dag node takes as input the lower
7554 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7555 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7557 /// HADD V0_LO, V1_LO
7558 /// HADD V0_HI, V1_HI
7560 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7561 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7562 /// the upper 128-bits of the result.
7563 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7564 const SDLoc &DL, SelectionDAG &DAG,
7565 unsigned X86Opcode, bool Mode,
7566 bool isUndefLO, bool isUndefHI) {
7567 MVT VT = V0.getSimpleValueType();
7568 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7569 "Invalid nodes in input!");
7571 unsigned NumElts = VT.getVectorNumElements();
7572 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7573 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7574 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7575 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7576 MVT NewVT = V0_LO.getSimpleValueType();
7578 SDValue LO = DAG.getUNDEF(NewVT);
7579 SDValue HI = DAG.getUNDEF(NewVT);
7581 if (Mode) {
7582 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7583 if (!isUndefLO && !V0->isUndef())
7584 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7585 if (!isUndefHI && !V1->isUndef())
7586 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7587 } else {
7588 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7589 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7590 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7592 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7593 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7594 }
7596 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7599 /// Returns true iff \p BV builds a vector with the result equivalent to
7600 /// the result of an ADDSUB/SUBADD operation.
7601 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7602 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7603 /// \p Opnd0 and \p Opnd1.
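/// For example (a sketch), the v4f32 build_vector
///   ((fsub A0, B0), (fadd A1, B1), (fsub A2, B2), (fadd A3, B3))
/// is recognized as ADDSUB with Opnd0 = A and Opnd1 = B.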
7604 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7605 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7606 SDValue &Opnd0, SDValue &Opnd1,
7607 unsigned &NumExtracts,
7610 MVT VT = BV->getSimpleValueType(0);
7611 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7614 unsigned NumElts = VT.getVectorNumElements();
7615 SDValue InVec0 = DAG.getUNDEF(VT);
7616 SDValue InVec1 = DAG.getUNDEF(VT);
7620 // Odd-numbered elements in the input build vector are obtained from
7621 // adding/subtracting two integer/float elements.
7622 // Even-numbered elements in the input build vector are obtained from
7623 // subtracting/adding two integer/float elements.
7624 unsigned Opc[2] {0, 0};
7625 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7626 SDValue Op = BV->getOperand(i);
7628 // Skip 'undef' values.
7629 unsigned Opcode = Op.getOpcode();
7630 if (Opcode == ISD::UNDEF)
7633 // Early exit if we found an unexpected opcode.
7634 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7637 SDValue Op0 = Op.getOperand(0);
7638 SDValue Op1 = Op.getOperand(1);
7640 // Try to match the following pattern:
7641 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7642 // Early exit if we cannot match that sequence.
7643 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7644 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7645 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7646 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7647 Op0.getOperand(1) != Op1.getOperand(1))
7650 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7654 // We found a valid add/sub node; make sure it's the same opcode as previous
7655 // elements for this parity.
7656 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7658 Opc[i % 2] = Opcode;
7660 // Update InVec0 and InVec1.
7661 if (InVec0.isUndef()) {
7662 InVec0 = Op0.getOperand(0);
7663 if (InVec0.getSimpleValueType() != VT)
7666 if (InVec1.isUndef()) {
7667 InVec1 = Op1.getOperand(0);
7668 if (InVec1.getSimpleValueType() != VT)
7672 // Make sure that the operands of each add/sub node always
7673 // come from the same pair of vectors.
7674 if (InVec0 != Op0.getOperand(0)) {
7675 if (Opcode == ISD::FSUB)
7678 // FADD is commutable. Try to commute the operands
7679 // and then test again.
7680 std::swap(Op0, Op1);
7681 if (InVec0 != Op0.getOperand(0))
7685 if (InVec1 != Op1.getOperand(0))
7688 // Increment the number of extractions done.
7692 // Ensure we have found an opcode for both parities and that they are
7693 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7694 // inputs are undef.
7695 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7696 InVec0.isUndef() || InVec1.isUndef())
7699 IsSubAdd = Opc[0] == ISD::FADD;
7706 /// Returns true if it is possible to fold MUL and an idiom that has already been
7707 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7708 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7709 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7711 /// Prior to calling this function it should be known that there is some
7712 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7713 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7714 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7715 /// of \p Opnd0 uses is expected to be equal to 2.
7716 /// For example, this function may be called for the following IR:
7717 /// %AB = fmul fast <2 x double> %A, %B
7718 /// %Sub = fsub fast <2 x double> %AB, %C
7719 /// %Add = fadd fast <2 x double> %AB, %C
7720 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7721 /// <2 x i32> <i32 0, i32 3>
7722 /// There is a def for %Addsub here, which can potentially be replaced by
7723 /// an X86ISD::ADDSUB operation:
7724 /// %Addsub = X86ISD::ADDSUB %AB, %C
7725 /// and such ADDSUB can further be replaced with FMADDSUB:
7726 /// %Addsub = FMADDSUB %A, %B, %C.
7728 /// The main reason why this method is called before the replacement of the
7729 /// recognized ADDSUB idiom with an ADDSUB operation is that such a replacement
7730 /// is sometimes illegal, e.g. 512-bit ADDSUB is not available, while 512-bit
7731 /// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG,
                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
                                 unsigned ExpectedUses) {
  if (Opnd0.getOpcode() != ISD::FMUL ||
      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  bool AllowFusion =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
  if (!AllowFusion)
    return false;

  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);

  return true;
}
/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
/// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
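///
/// For example (a sketch): a v4f32 build_vector of
///   (fsub %A0, %B0), (fadd %A1, %B1), (fsub %A2, %B2), (fadd %A3, %B3)
/// where %Ai/%Bi are the i-th extracted elements of vectors %A and %B is
/// recognized with IsSubAdd = false and lowered to (X86ISD::ADDSUB %A, %B).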
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  unsigned NumExtracts;
  bool IsSubAdd;
  if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
                        IsSubAdd))
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
  }

  // We only support ADDSUB.
  if (IsSubAdd)
    return SDValue();

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions! The 512-bit ADDSUB idiom
  // recognition was needed only as part of the FMADDSUB idiom recognition.
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
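///
/// For example (a sketch, SSE3): the v4f32 build_vector
///   (fadd %A0, %A1), (fadd %A2, %A3), (fadd %B0, %B1), (fadd %B2, %B3)
/// where %Ai/%Bi are consecutive extracted elements of vectors %A and %B
/// matches X86ISD::FHADD, i.e. a single HADDPS of %A and %B.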
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  unsigned Half = NumElts/2;

  // Count the number of UNDEF operands in the build_vector in input.
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  // Early exit if this is either a build_vector of all UNDEFs or all the
  // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
    // Try to match an SSE3 float HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
    // Try to match an SSSE3 integer HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }

  if (!Subtarget.hasAVX())
    return SDValue();

  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
    // Try to match an AVX horizontal add/sub of packed single/double
    // precision floating point values from 256-bit vectors.
    SDValue InVec2, InVec3;
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
             isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
                               InVec3) &&
             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Fold this build_vector into a single horizontal add/sub.
      // Do this only if the target has AVX2.
      if (Subtarget.hasAVX2())
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binop followed by
      // a concat vector.
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                   isUndefLO, isUndefHI);
    }
  }

  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget.hasAVX()) {
    unsigned X86Opcode;
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// just apply the bit operation to the vectors.
/// NOTE: It's not in our interest to build a general purpose vectorizer
/// from this, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to need basic support.
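///
/// For example (a sketch): the v4i32 build_vector
///   (and %a, 1), (and %b, 2), (and %c, 4), (and %d, 8)
/// becomes (and (build_vector %a, %b, %c, %d), (build_vector 1, 2, 4, 8)).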
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  switch (Opcode) {
  default:
    return SDValue();
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    // Don't do this if the buildvector is a splat - we'd replace one
    // constant with an entire vector.
    if (Op->getSplatValue())
      return SDValue();
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
    break;
  }

  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue LHS = Elt.getOperand(0);
    SDValue RHS = Elt.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();
    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
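///
/// For example, an all-ones integer vector can be materialized with a
/// PCMPEQD of a register against itself instead of a constant-pool load.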
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getZeroVector(VT, Subtarget, DAG, DL);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
        (VT == MVT::v8i32 && Subtarget.hasInt256()))
      return Op;

    return getOnesVector(VT, DAG, DL);
  }

  return SDValue();
}
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
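///
/// For example (a sketch): a v16i8 permute of SrcVec by IndicesVec on an
/// SSSE3 target becomes a single PSHUFB with IndicesVec as the byte mask,
/// since PSHUFB selects each destination byte by the low 4 bits of the
/// corresponding mask byte.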
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
                                     SDLoc &DL, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  MVT ShuffleVT = VT;
  EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Adjust IndicesVec to match VT size.
  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
         "Illegal variable permute mask size");
  if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
    IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
                                  NumElts * VT.getScalarSizeInBits());
  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

  // Handle SrcVec that doesn't match the VT type.
  if (SrcVec.getValueSizeInBits() != SizeInBits) {
    if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
      // Handle larger SrcVec by treating it as a larger permute.
      unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
      VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
      IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
      IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
                                  Subtarget, DAG, SDLoc(IndicesVec));
      return extractSubVector(
          createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
          DAG, DL, SizeInBits);
    } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
      // Widen smaller SrcVec to match VT.
      SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
    } else
      return SDValue();
  }

  auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
    EVT SrcVT = Idx.getValueType();
    unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
    uint64_t IndexScale = 0;
    uint64_t IndexOffset = 0;

    // If we're scaling a smaller permute op, then we need to repeat the
    // indices, scaling and offsetting them as well.
    // e.g. v4i32 -> v16i8 (Scale = 4)
    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
    // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
    for (uint64_t i = 0; i != Scale; ++i) {
      IndexScale |= Scale << (i * NumDstBits);
      IndexOffset |= i << (i * NumDstBits);
    }

    Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
    Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
    return Idx;
  };
  unsigned Opcode = 0;
  switch (VT.SimpleTy) {
  default:
    break;
  case MVT::v16i8:
    if (Subtarget.hasSSSE3())
      Opcode = X86ISD::PSHUFB;
    break;
  case MVT::v8i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v4f32:
  case MVT::v4i32:
    if (Subtarget.hasAVX()) {
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v4f32;
    } else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v2f64:
  case MVT::v2i64:
    if (Subtarget.hasAVX()) {
      // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v2f64;
    } else if (Subtarget.hasSSE41()) {
      // SSE41 can compare v2i64 - select between indices 0 and 1.
      return DAG.getSelectCC(
          DL, IndicesVec,
          getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
          ISD::CondCode::SETEQ);
    }
    break;
  case MVT::v32i8:
    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasXOP()) {
      SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
      SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
      return DAG.getNode(
          ISD::CONCAT_VECTORS, DL, VT,
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
    } else if (Subtarget.hasAVX()) {
      SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
      SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
      auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                              ArrayRef<SDValue> Ops) {
        // Permute Lo and Hi and then select based on index range.
        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
        // care about bit[7] as it's just an index vector.
        SDValue Idx = Ops[2];
        EVT VT = Idx.getValueType();
        return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
                               ISD::CondCode::SETGT);
      };
      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
      return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
                              PSHUFBBuilder);
    }
    break;
  case MVT::v16i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      // Scale to v32i8 and perform as v32i8.
      IndicesVec = ScaleIndices(IndicesVec, 2);
      return DAG.getBitcast(
          VT, createVariablePermute(
                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
                  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
    }
    break;
  case MVT::v8f32:
  case MVT::v8i32:
    if (Subtarget.hasAVX2())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
      SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {0, 1, 2, 3, 0, 1, 2, 3});
      SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {4, 5, 6, 7, 4, 5, 6, 7});
      if (Subtarget.hasXOP())
        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
                                              LoLo, HiHi, IndicesVec,
                                              DAG.getConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v4i64:
  case MVT::v4f64:
    if (Subtarget.hasAVX512()) {
      if (!Subtarget.hasVLX()) {
        MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
                                SDLoc(SrcVec));
        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
                                    DAG, SDLoc(IndicesVec));
        SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
                                            DAG, Subtarget);
        return extract256BitVector(Res, 0, DAG, DL);
      }
      Opcode = X86ISD::VPERMV;
    } else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
      SDValue LoLo =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
      SDValue HiHi =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
      // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      if (Subtarget.hasXOP())
        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
                                              LoLo, HiHi, IndicesVec,
                                              DAG.getConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPD only uses index bit[1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v64i8:
    if (Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v32i16:
    if (Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v16f32:
  case MVT::v16i32:
  case MVT::v8f64:
  case MVT::v8i64:
    if (Subtarget.hasAVX512())
      Opcode = X86ISD::VPERMV;
    break;
  }
  if (!Opcode)
    return SDValue();

  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
         "Illegal variable permute shuffle type");

  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
  if (Scale > 1)
    IndicesVec = ScaleIndices(IndicesVec, Scale);

  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);

  SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
  SDValue Res = Opcode == X86ISD::VPERMV
                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
  return DAG.getBitcast(VT, Res);
}
// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
// reasoned to be a permutation of a vector by indices in a non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
//               (extract_elt V, (extract_elt I, 1)),
//                    ...
// ->
// (vpermv I, V)
//
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
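//
// For example (a sketch): with v4i32 %v and %i,
//   (build_vector (extract_elt %v, (extract_elt %i, 0)), ...,
//                 (extract_elt %v, (extract_elt %i, 3)))
// becomes createVariablePermute(v4i32, %v, %i), e.g. a variable VPERMILPS
// on AVX targets.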
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  SDValue SrcVec, IndicesVec;
  // Check for a match of the permute source vector and permute index elements.
  // This is done by checking that the i-th build_vector operand is of the form:
  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
    SDValue Op = V.getOperand(Idx);
    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // If this is the first extract encountered in V, set the source vector,
    // otherwise verify the extract is from the previously defined source
    // vector.
    if (!SrcVec)
      SrcVec = Op.getOperand(0);
    else if (SrcVec != Op.getOperand(0))
      return SDValue();
    SDValue ExtractedIndex = Op->getOperand(1);
    // Peek through extends.
    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
      ExtractedIndex = ExtractedIndex.getOperand(0);
    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // If this is the first extract from the index vector candidate, set the
    // indices vector, otherwise verify the extract is from the previously
    // defined indices vector.
    if (!IndicesVec)
      IndicesVec = ExtractedIndex.getOperand(0);
    else if (IndicesVec != ExtractedIndex.getOperand(0))
      return SDValue();

    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
    if (!PermIdx || PermIdx->getZExtValue() != Idx)
      return SDValue();
  }

  SDLoc DL(V);
  MVT VT = V.getSimpleValueType();
  return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
    return BitOp;

  unsigned EVTBits = EltVT.getSizeInBits();

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  unsigned NumConstants = NumElems;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef())
      continue;
    Values.insert(Elt);
    if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
      IsAllConstants = false;
      NumConstants--;
    }
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
      NonZeros |= ((uint64_t)1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // If we are inserting one variable into a vector of non-zero constants, try
  // to avoid loading each constant element as a scalar. Load the constants as a
  // vector and then insert the variable scalar element. If insertion is not
  // supported, we assume that we will fall back to a shuffle to get the scalar
  // blended with the constants. Insertion into a zero vector is handled as a
  // special-case somewhere below here.
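  // For example (a sketch): <4 x i32> <i32 1, i32 %x, i32 3, i32 4> is
  // lowered as a constant-pool load of <1, undef, 3, 4> followed by a single
  // insertelement of %x at index 1.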
  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
    // Create an all-constant vector. The variable element in the old
    // build vector is replaced by undef in the constant vector. Save the
    // variable scalar element and its index for use in the insertelement.
    LLVMContext &Context = *DAG.getContext();
    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
    SDValue VarElt;
    SDValue InsIndex;
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Elt = Op.getOperand(i);
      if (auto *C = dyn_cast<ConstantSDNode>(Elt))
        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
      else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
      else if (!Elt.isUndef()) {
        assert(!VarElt.getNode() && !InsIndex.getNode() &&
               "Expected one variable element in this vector");
        VarElt = Elt;
        InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
      }
    }
    Constant *CV = ConstantVector::get(ConstVecOps);
    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

    // The constants we just created may not be legal (eg, floating point). We
    // must lower the vector right here because we can not guarantee that we'll
    // legalize it before loading it. This is also why we could not just create
    // a new build vector here. If the build vector contains illegal constants,
    // it could get split back up into a series of insert elements.
    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
    MachineFunction &MF = DAG.getMachineFunction();
    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
  }

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
          (EltVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (EltVT == MVT::i16 || EltVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        if (VT.getSizeInBits() >= 256) {
          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
          if (Subtarget.hasAVX()) {
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          } else {
            // Without AVX, we need to extend to a 128-bit vector and then
            // insert into the 256-bit vector.
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          }
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getBitcast(VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget,
                                         DAG);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
    return V;

  // See if we can use a vector load to get all of the elements.
  {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
      return LD;
  }

  // If this is a splat of pairs of 32-bit elements, we can use a narrower
  // build_vector and broadcast it.
  // TODO: We could probably generalize this more.
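  // For example (a sketch): v8i32 <%a, %b, %a, %b, %a, %b, %a, %b> is built
  // as a v4i32 build_vector <%a, %b, undef, undef>, bitcast to v2i64, and
  // then VBROADCAST of the low 64-bit element.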
  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
      // Make sure all the even/odd operands match.
      for (unsigned i = 2; i != NumElems; ++i)
        if (Ops[i % 2] != Op.getOperand(i))
          return false;
      return true;
    };
    if (CanSplat(Op, NumElems, Ops)) {
      MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
      MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
      // Create a new build vector and cast to v2i64/v2f64.
      SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
                                     DAG.getBuildVector(NarrowVT, dl, Ops));
      // Broadcast from v2i64/v2f64 and cast to final VT.
      MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
                                            NewBV));
    }
  }

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.getSizeInBits() > 128) {
    MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));

    // Recreate the wider vector with the lower and upper part.
    return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
                            VT.getSizeInBits() / 2);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS.
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros >> (i*2)) & 0x3) {
      default: llvm_unreachable("Unexpected NonZero count");
      case 0:
        Ops[i] = Ops[i*2]; // Must be a zero vector.
        break;
      case 1:
        Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
        break;
      case 2:
        Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
        break;
      case 3:
        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
        break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }

  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

  // Check for a build vector from mostly shuffle plus few inserting.
  if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
    return Sh;

  // For SSE 4.1, use insertps to put the high elements into the low element.
  if (Subtarget.hasSSE41()) {
    SDValue Result;
    if (!Op.getOperand(0).isUndef())
      Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
    else
      Result = DAG.getUNDEF(VT);

    for (unsigned i = 1; i < NumElems; ++i) {
      if (Op.getOperand(i).isUndef()) continue;
      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                           Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
    }
    return Result;
  }

  // Otherwise, expand into a number of unpckl*, start by extending each of
  // our (non-undef) elements to the full vector width with the element in the
  // bottom slot of the vector (which generates no code for SSE).
  SmallVector<SDValue, 8> Ops(NumElems);
  for (unsigned i = 0; i < NumElems; ++i) {
    if (!Op.getOperand(i).isUndef())
      Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    else
      Ops[i] = DAG.getUNDEF(VT);
  }

  // Next, we iteratively mix elements, e.g. for v4f32:
  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
  //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    // Generate scaled UNPCKL shuffle mask.
    SmallVector<int, 16> Mask;
    for(unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems+i);
    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
  }
  return Ops[0];
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
// TODO: Detect subvector broadcast here instead of DAG combine?
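//
// For example (a sketch): (concat_vectors v4i32 %a, v4i32 zeroinitializer)
// becomes an insert_subvector of %a at element 0 of a zero v8i32, which can
// later be selected as a single 128-bit insert into a zeroed register.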
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  unsigned NumOperands = Op.getNumOperands();
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      ++NumZero;
    else {
      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
      NonZeros |= 1 << i;
      ++NumNonZero;
    }
  }

  // If we have more than 2 non-zeros, build each half separately.
  if (NumNonZero > 2) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands/2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands/2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  // Otherwise, build it up through insert_subvectors.
  SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
                        : DAG.getUNDEF(ResVT);

  MVT SubVT = Op.getOperand(0).getSimpleValueType();
  unsigned NumSubElems = SubVT.getVectorNumElements();
  for (unsigned i = 0; i != NumOperands; ++i) {
    if ((NonZeros & (1 << i)) == 0)
      continue;

    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
                      Op.getOperand(i),
                      DAG.getIntPtrConstant(i * NumSubElems, dl));
  }

  return Vec;
}
// Return true if all the operands of the given CONCAT_VECTORS node are zeros
// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
static bool isExpandWithZeros(const SDValue &Op) {
  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
         "Expand with zeros only possible in CONCAT_VECTORS nodes!");

  for (unsigned i = 1; i < Op.getNumOperands(); i++)
    if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
      return false;

  return true;
}
// If the given node is a type promotion (by concatenating i1 zeros) of the
// result of a node that already zeros all upper bits of a k-register,
// returns that upper-bits-zeroing node; otherwise returns SDValue().
static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
  unsigned Opc = Op.getOpcode();

  assert(Opc == ISD::CONCAT_VECTORS &&
         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected node to check for type promotion!");

  // As long as we are concatenating zeros to the upper part of a previous node
  // result, climb up the tree until a node with a different opcode is
  // encountered.
  while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
    if (Opc == ISD::INSERT_SUBVECTOR) {
      if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
          Op.getConstantOperandVal(2) == 0)
        Op = Op.getOperand(1);
      else
        return SDValue();
    } else { // Opc == ISD::CONCAT_VECTORS
      if (isExpandWithZeros(Op))
        Op = Op.getOperand(0);
      else
        return SDValue();
    }
    Opc = Op.getOpcode();
  }

  // Check if the first inserted node zeroes the upper bits, or an 'and' result
  // of a node that zeros the upper bits (its masked version).
  if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
      (Op.getOpcode() == ISD::AND &&
       (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
    return Op;
  }

  return SDValue();
}
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOperands = Op.getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  // If this node promotes - by concatenating zeroes - the type of the result
  // of a node with an instruction that zeroes all upper (irrelevant) bits of
  // the output register, mark it as legal and catch the pattern in instruction
  // selection to avoid emitting extra instructions (for zeroing upper bits).
  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
    return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      ++NumZero;
    else {
      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
      NonZeros |= (uint64_t)1 << i;
      ++NumNonZero;
    }
  }

  // If there are zero or one non-zeros we can handle this very simply.
  if (NumNonZero <= 1) {
    SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
                          : DAG.getUNDEF(ResVT);
    if (!NumNonZero)
      return Vec;
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue SubVec = Op.getOperand(Idx);
    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
  }

  if (NumOperands > 2) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands/2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands/2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  assert(NumNonZero == 2 && "Simple cases not handled?");

  if (ResVT.getVectorNumElements() >= 16)
    return Op; // The operation is legal with KUNPCK

  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
                            DAG.getUNDEF(ResVT), Op.getOperand(0),
                            DAG.getIntPtrConstant(0, dl));
  unsigned NumElems = ResVT.getVectorNumElements();
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
                     DAG.getIntPtrConstant(NumElems/2, dl));
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
          Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.
  // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// operations.
//===----------------------------------------------------------------------===//
/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}
/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
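///
/// For example (a sketch): for v8f32, <0, 1, 2, 3, 4, 5, 6, 7> stays within
/// lanes, while <0, 6, 2, 3, 4, 5, 6, 7> is lane-crossing because slot 1 in
/// the low 128-bit lane reads element 6 from the high lane.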
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
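///
/// For example (a sketch): the v8f32 mask <0, 9, 2, 11, 4, 13, 6, 15>
/// performs the same blend in both 128-bit lanes, so this returns true with
/// RepeatedMask = <0, 5, 2, 7>.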
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
  SmallVector<int, 32> RepeatedMask;
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }
  return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}
// Merges a general DAG shuffle mask and zeroable bit mask into a target
// shuffle mask.
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                    const APInt &Zeroable) {
  int NumElts = Mask.size();
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
  }
  return TargetMask;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
                         isTargetShuffleEquivalent(Mask, Unpckhwd));
  return IsUnpackwdMask;
}
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
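///
/// For example (a sketch): the mask <2, 3, 0, 1> encodes as
/// (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E, the familiar immediate
/// used by PSHUFD to swap the two 64-bit halves of a vector.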
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
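///
/// For example (a sketch): with V2 = zeroinitializer and a v4i32 mask
/// <0, 1, 4, 5>, elements 2 and 3 read from the zero vector, so the returned
/// APInt has bits 2 and 3 set.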
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
                                            SDValue V1, SDValue V2) {
  APInt Zeroable(Mask.size(), 0);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef() || X86::isZeroNode(Op))
        Zeroable.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      if (AllZeroable)
        Zeroable.setBit(i);
      continue;
    }
  }

  return Zeroable;
}
// The shuffle result has the form:
//   0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
//   ascending order.
// Each element of Zeroable corresponds to a particular element of Mask, as
// described in the computeZeroableShuffleElements function.
//
// The function looks for a sub-mask whose nonzero elements are in increasing
// order. If such a sub-mask exists, the function returns true.
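//
// For example (a sketch): on v4i32, Mask = <4, 0, 1, 2> with element 0
// zeroable reads consecutive input elements 0, 1 and 2 in increasing order,
// so the function returns true.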
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zeros elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non zero element.
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2,
                                            const APInt &Zeroable,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);

// X86 has a dedicated shuffle that can be lowered to VEXPAND.
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                          const APInt &Zeroable,
                                          ArrayRef<int> Mask, SDValue &V1,
                                          SDValue &V2, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getSelect(DL, VT, VMask,
                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
                       ZeroVector);
}
9338 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9339 unsigned &UnpackOpcode, bool IsUnary,
9340 ArrayRef<int> TargetMask,
9341 const SDLoc &DL, SelectionDAG &DAG,
9342 const X86Subtarget &Subtarget) {
9343 int NumElts = VT.getVectorNumElements();
9345 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9346 for (int i = 0; i != NumElts; i += 2) {
9347 int M1 = TargetMask[i + 0];
9348 int M2 = TargetMask[i + 1];
9349 Undef1 &= (SM_SentinelUndef == M1);
9350 Undef2 &= (SM_SentinelUndef == M2);
9351 Zero1 &= isUndefOrZero(M1);
9352 Zero2 &= isUndefOrZero(M2);
9354 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9355 "Zeroable shuffle detected");
9357 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9358 SmallVector<int, 64> Unpckl, Unpckh;
9359 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9360 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9361 UnpackOpcode = X86ISD::UNPCKL;
9362 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9363 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9364 return true;
9365 }
9367 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9368 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9369 UnpackOpcode = X86ISD::UNPCKH;
9370 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9371 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9372 return true;
9373 }
9375 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
9376 if (IsUnary && (Zero1 || Zero2)) {
9377 // Don't bother if we can blend instead.
9378 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9379 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9380 return false;
9382 bool MatchLo = true, MatchHi = true;
9383 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9384 int M = TargetMask[i];
9386 // Ignore if the input is known to be zero or the index is undef.
9387 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9388 (M == SM_SentinelUndef))
9389 continue;
9391 MatchLo &= (M == Unpckl[i]);
9392 MatchHi &= (M == Unpckh[i]);
9393 }
9395 if (MatchLo || MatchHi) {
9396 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9397 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9398 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9399 return true;
9400 }
9401 }
9403 // If a binary shuffle, commute and try again.
9404 if (!IsUnary) {
9405 ShuffleVectorSDNode::commuteMask(Unpckl);
9406 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9407 UnpackOpcode = X86ISD::UNPCKL;
9408 std::swap(V1, V2);
9409 return true;
9410 }
9412 ShuffleVectorSDNode::commuteMask(Unpckh);
9413 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9414 UnpackOpcode = X86ISD::UNPCKH;
9415 std::swap(V1, V2);
9416 return true;
9417 }
9418 }
9420 return false;
9421 }
9423 // X86 has dedicated unpack instructions that can handle specific blend
9424 // operations: UNPCKH and UNPCKL.
9425 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9426 ArrayRef<int> Mask, SDValue V1,
9427 SDValue V2, SelectionDAG &DAG) {
9428 SmallVector<int, 8> Unpckl;
9429 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9430 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9431 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9433 SmallVector<int, 8> Unpckh;
9434 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9435 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9436 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9438 // Commute and try again.
9439 ShuffleVectorSDNode::commuteMask(Unpckl);
9440 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9441 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9443 ShuffleVectorSDNode::commuteMask(Unpckh);
9444 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9445 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
9447 return SDValue();
9448 }
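// Worked example (illustrative): for v4i32 the unpack masks created above are
//   UNPCKL -> <0,4,1,5>   UNPCKH -> <2,6,3,7>
// (and <0,0,1,1> / <2,2,3,3> for the unary forms), so a shuffle mask such as
// <0,4,1,5> lowers directly to a single UNPCKL of V1 and V2.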
9450 static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
9451 int Delta) {
9452 int Size = (int)Mask.size();
9453 int Split = Size / Delta;
9454 int TruncatedVectorStart = SwappedOps ? Size : 0;
9456 // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
9457 if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
9458 return false;
9460 // The rest of the mask should not refer to the truncated vector's elements.
9461 if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
9462 TruncatedVectorStart + Size))
9463 return false;
9465 return true;
9466 }
9468 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
9470 // An example is the following:
9472 // t0: ch = EntryToken
9473 // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
9474 // t25: v4i32 = truncate t2
9475 // t41: v8i16 = bitcast t25
9476 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
9477 // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
9478 // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
9479 // t18: v2i64 = bitcast t51
9481 // Without avx512vl, this is lowered to:
9483 // vpmovqd %zmm0, %ymm0
9484 // vpshufb {{.*#+}} xmm0 =
9485 // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
9487 // But when avx512vl is available, one can just use a single vpmovdw
9488 // instruction.
9489 static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
9490 MVT VT, SDValue V1, SDValue V2,
9491 SelectionDAG &DAG,
9492 const X86Subtarget &Subtarget) {
9493 if (VT != MVT::v16i8 && VT != MVT::v8i16)
9494 return SDValue();
9496 if (Mask.size() != VT.getVectorNumElements())
9497 return SDValue();
9499 bool SwappedOps = false;
9501 if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
9502 if (!ISD::isBuildVectorAllZeros(V1.getNode()))
9503 return SDValue();
9505 std::swap(V1, V2);
9506 SwappedOps = true;
9507 }
9509 // Look for:
9510 //
9511 // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
9512 // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
9514 // and similar ones.
9515 if (V1.getOpcode() != ISD::BITCAST)
9516 return SDValue();
9517 if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
9518 return SDValue();
9520 SDValue Src = V1.getOperand(0).getOperand(0);
9521 MVT SrcVT = Src.getSimpleValueType();
9523 // The vptrunc** instructions truncating 128 bit and 256 bit vectors
9524 // are only available with avx512vl.
9525 if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
9526 return SDValue();
9528 // Down Convert Word to Byte is only available with avx512bw. The case with
9529 // 256-bit output doesn't contain a shuffle and is therefore not handled here.
9530 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
9531 !Subtarget.hasBWI())
9532 return SDValue();
9534 // The first half/quarter of the mask should refer to every second/fourth
9535 // element of the vector truncated and bitcasted.
9536 if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
9537 !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
9538 return SDValue();
9540 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
9541 }
9543 // X86 has dedicated pack instructions that can handle specific truncation
9544 // operations: PACKSS and PACKUS.
9545 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
9546 SDValue &V2, unsigned &PackOpcode,
9547 ArrayRef<int> TargetMask,
9548 SelectionDAG &DAG,
9549 const X86Subtarget &Subtarget) {
9550 unsigned NumElts = VT.getVectorNumElements();
9551 unsigned BitSize = VT.getScalarSizeInBits();
9552 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
9553 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
9555 auto MatchPACK = [&](SDValue N1, SDValue N2) {
9556 SDValue VV1 = DAG.getBitcast(PackVT, N1);
9557 SDValue VV2 = DAG.getBitcast(PackVT, N2);
9558 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
9559 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
9560 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
9561 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
9562 V1 = VV1;
9563 V2 = VV2;
9564 SrcVT = PackVT;
9565 PackOpcode = X86ISD::PACKUS;
9566 return true;
9567 }
9568 }
9569 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
9570 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
9571 V1 = VV1;
9572 V2 = VV2;
9573 SrcVT = PackVT;
9574 PackOpcode = X86ISD::PACKSS;
9575 return true;
9576 }
9578 return false;
9579 };
9580 // Try binary shuffle.
9581 SmallVector<int, 32> BinaryMask;
9582 createPackShuffleMask(VT, BinaryMask, false);
9583 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
9584 if (MatchPACK(V1, V2))
9585 return true;
9587 // Try unary shuffle.
9588 SmallVector<int, 32> UnaryMask;
9589 createPackShuffleMask(VT, UnaryMask, true);
9590 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
9591 if (MatchPACK(V1, V1))
9592 return true;
9594 return false;
9595 }
9597 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9598 ArrayRef<int> Mask, SDValue V1,
9599 SDValue V2, SelectionDAG &DAG,
9600 const X86Subtarget &Subtarget) {
9601 MVT PackVT;
9602 unsigned PackOpcode;
9603 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
9604 Subtarget))
9605 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9606 DAG.getBitcast(PackVT, V2));
9608 return SDValue();
9609 }
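// Worked example (illustrative): a v16i8 shuffle
// <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30> of two v8i16 inputs is the
// binary pack mask; if every 16-bit source element is known to fit in its
// low 8 bits it becomes a single PACKUS, or a PACKSS when the sign bits are
// known to be redundant instead.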
9611 /// Try to emit a bitmask instruction for a shuffle.
9613 /// This handles cases where we can model a blend exactly as a bitmask due to
9614 /// one of the inputs being zeroable.
9615 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9616 SDValue V2, ArrayRef<int> Mask,
9617 const APInt &Zeroable,
9618 SelectionDAG &DAG) {
9619 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9620 MVT EltVT = VT.getVectorElementType();
9621 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9622 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9623 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9624 SDValue V;
9625 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9626 if (Mask[i] < 0)
9627 continue;
9628 if (Mask[i] % Size != i)
9629 return SDValue(); // Not a blend.
9630 if (!V)
9631 V = Mask[i] < Size ? V1 : V2;
9632 else if (V != (Mask[i] < Size ? V1 : V2))
9633 return SDValue(); // Can only let one input through the mask.
9635 VMaskOps[i] = AllOnes;
9636 }
9637 if (!V)
9638 return SDValue(); // No non-zeroable elements!
9640 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9641 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
9642 }
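// Worked example (illustrative): for a v4i32 shuffle <zz,1,zz,3> where lanes
// 0 and 2 are zeroable, V is V1 and VMaskOps becomes {0,-1,0,-1}, so the
// whole shuffle reduces to a single AND with that constant bitmask.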
9644 /// Try to emit a blend instruction for a shuffle using bit math.
9646 /// This is used as a fallback approach when first class blend instructions are
9647 /// unavailable. Currently it is only suitable for integer vectors, but could
9648 /// be generalized for floating point vectors if desirable.
9649 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9650 SDValue V2, ArrayRef<int> Mask,
9651 SelectionDAG &DAG) {
9652 assert(VT.isInteger() && "Only supports integer vector types!");
9653 MVT EltVT = VT.getVectorElementType();
9654 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9655 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9656 SmallVector<SDValue, 16> MaskOps;
9657 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9658 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9659 return SDValue(); // Shuffled input!
9660 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9661 }
9663 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9664 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9665 // We have to cast V2 around.
9666 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9667 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9668 DAG.getBitcast(MaskVT, V1Mask),
9669 DAG.getBitcast(MaskVT, V2)));
9670 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
9671 }
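// Worked example (illustrative): for a v4i32 blend mask <0,5,2,7>, V1Mask
// becomes {-1,0,-1,0} and the result is computed as
//   (V1 & M) | ANDNP(M, V2)  ==  (V1 & M) | (~M & V2),
// so lanes 0/2 pass V1 through and lanes 1/3 pass V2.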
9673 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9674 SDValue PreservedSrc,
9675 const X86Subtarget &Subtarget,
9676 SelectionDAG &DAG);
9678 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9679 MutableArrayRef<int> TargetMask,
9680 bool &ForceV1Zero, bool &ForceV2Zero,
9681 uint64_t &BlendMask) {
9682 bool V1IsZeroOrUndef =
9683 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9684 bool V2IsZeroOrUndef =
9685 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9687 BlendMask = 0;
9688 ForceV1Zero = false, ForceV2Zero = false;
9689 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9691 // Attempt to generate the binary blend mask. If an input is zero then
9692 // we can use any lane.
9693 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9694 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9695 int M = TargetMask[i];
9696 if (M == SM_SentinelUndef)
9697 continue;
9698 if (M == i)
9699 continue;
9700 if (M == i + Size) {
9701 BlendMask |= 1ull << i;
9702 continue;
9703 }
9704 if (M == SM_SentinelZero) {
9705 if (V1IsZeroOrUndef) {
9706 ForceV1Zero = true;
9707 TargetMask[i] = i;
9708 continue;
9709 }
9710 if (V2IsZeroOrUndef) {
9711 ForceV2Zero = true;
9712 BlendMask |= 1ull << i;
9713 TargetMask[i] = i + Size;
9714 continue;
9715 }
9716 }
9717 return false;
9718 }
9720 return true;
9721 }
9722 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
9723 int Scale) {
9724 uint64_t ScaledMask = 0;
9725 for (int i = 0; i != Size; ++i)
9726 if (BlendMask & (1ull << i))
9727 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
9728 return ScaledMask;
9729 }
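// Worked example (illustrative): BlendMask = 0b0101 with Size = 4 and
// Scale = 2 yields 0b00110011, i.e. each selected element expands to Scale
// consecutive bits in the scaled mask.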
9731 /// Try to emit a blend instruction for a shuffle.
9733 /// This doesn't do any checks for the availability of instructions for blending
9734 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9735 /// be matched in the backend with the type given. What it does check for is
9736 /// that the shuffle mask is a blend, or convertible into a blend with zero.
9737 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9738 SDValue V2, ArrayRef<int> Original,
9739 const APInt &Zeroable,
9740 const X86Subtarget &Subtarget,
9741 SelectionDAG &DAG) {
9742 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9744 uint64_t BlendMask = 0;
9745 bool ForceV1Zero = false, ForceV2Zero = false;
9746 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9747 BlendMask))
9748 return SDValue();
9750 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9751 if (ForceV1Zero)
9752 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9753 if (ForceV2Zero)
9754 V2 = getZeroVector(VT, Subtarget, DAG, DL);
9756 switch (VT.SimpleTy) {
9757 case MVT::v2f64:
9758 case MVT::v4f32:
9759 case MVT::v4f64:
9760 case MVT::v8f32:
9761 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9762 DAG.getConstant(BlendMask, DL, MVT::i8));
9764 case MVT::v4i64:
9765 case MVT::v8i32:
9766 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9767 LLVM_FALLTHROUGH;
9768 case MVT::v2i64:
9769 case MVT::v4i32:
9770 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9771 // that instruction.
9772 if (Subtarget.hasAVX2()) {
9773 // Scale the blend by the number of 32-bit dwords per element.
9774 int Scale = VT.getScalarSizeInBits() / 32;
9775 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9776 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9777 V1 = DAG.getBitcast(BlendVT, V1);
9778 V2 = DAG.getBitcast(BlendVT, V2);
9779 return DAG.getBitcast(
9780 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9781 DAG.getConstant(BlendMask, DL, MVT::i8)));
9782 }
9783 LLVM_FALLTHROUGH;
9784 case MVT::v8i16: {
9785 // For integer shuffles we need to expand the mask and cast the inputs to
9786 // v8i16s prior to blending.
9787 int Scale = 8 / VT.getVectorNumElements();
9788 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9789 V1 = DAG.getBitcast(MVT::v8i16, V1);
9790 V2 = DAG.getBitcast(MVT::v8i16, V2);
9791 return DAG.getBitcast(VT,
9792 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9793 DAG.getConstant(BlendMask, DL, MVT::i8)));
9794 }
9795 case MVT::v16i16: {
9797 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9798 SmallVector<int, 8> RepeatedMask;
9799 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9800 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9801 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
9802 BlendMask = 0;
9803 for (int i = 0; i < 8; ++i)
9804 if (RepeatedMask[i] >= 8)
9805 BlendMask |= 1ull << i;
9806 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9807 DAG.getConstant(BlendMask, DL, MVT::i8));
9808 }
9809 LLVM_FALLTHROUGH;
9810 }
9811 case MVT::v16i8:
9812 case MVT::v32i8: {
9813 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9814 "256-bit byte-blends require AVX2 support!");
9816 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9817 MVT IntegerType =
9818 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9819 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9820 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9821 }
9823 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9824 if (SDValue Masked =
9825 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9826 return Masked;
9828 // Scale the blend by the number of bytes per element.
9829 int Scale = VT.getScalarSizeInBits() / 8;
9831 // This form of blend is always done on bytes. Compute the byte vector
9832 // type.
9833 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9835 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9836 // mix of LLVM's code generator and the x86 backend. We tell the code
9837 // generator that boolean values in the elements of an x86 vector register
9838 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9839 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9840 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9841 // of the element (the remaining are ignored) and 0 in that high bit would
9842 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9843 // the LLVM model for boolean values in vector elements gets the relevant
9844 // bit set, it is set backwards and over constrained relative to x86's
9845 // actual model.
9846 SmallVector<SDValue, 32> VSELECTMask;
9847 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9848 for (int j = 0; j < Scale; ++j)
9849 VSELECTMask.push_back(
9850 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9851 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9852 MVT::i8));
9854 V1 = DAG.getBitcast(BlendVT, V1);
9855 V2 = DAG.getBitcast(BlendVT, V2);
9856 return DAG.getBitcast(
9858 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9859 V1, V2));
9860 }
9861 case MVT::v16f32:
9862 case MVT::v8f64:
9863 case MVT::v4i64:
9864 case MVT::v8i32:
9865 case MVT::v16i16:
9866 case MVT::v32i8:
9867 case MVT::v64i8: {
9868 MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9869 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9870 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9871 }
9872 default:
9873 llvm_unreachable("Not a supported integer vector type!");
9874 }
9875 }
9877 /// Try to lower as a blend of elements from two inputs followed by
9878 /// a single-input permutation.
9880 /// This matches the pattern where we can blend elements from two inputs and
9881 /// then reduce the shuffle to a single-input permutation.
9882 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9883 SDValue V1, SDValue V2,
9884 ArrayRef<int> Mask,
9885 SelectionDAG &DAG) {
9886 // We build up the blend mask while checking whether a blend is a viable way
9887 // to reduce the shuffle.
9888 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9889 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9891 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9892 if (Mask[i] < 0)
9893 continue;
9895 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9897 if (BlendMask[Mask[i] % Size] < 0)
9898 BlendMask[Mask[i] % Size] = Mask[i];
9899 else if (BlendMask[Mask[i] % Size] != Mask[i])
9900 return SDValue(); // Can't blend in the needed input!
9902 PermuteMask[i] = Mask[i] % Size;
9903 }
9905 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9906 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9907 }
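// Worked example (illustrative): the v4i32 mask <5,0,7,2> is not itself a
// blend, but blending first with BlendMask <0,5,2,7> and then permuting the
// result with PermuteMask <1,0,3,2> produces it, which is exactly what the
// loop above computes.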
9909 /// Generic routine to decompose a shuffle and blend into independent
9910 /// blends and permutes.
9912 /// This matches the extremely common pattern for handling combined
9913 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9914 /// operations. It will try to pick the best arrangement of shuffles and
9915 /// blends.
9916 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9917 MVT VT, SDValue V1,
9918 SDValue V2,
9919 ArrayRef<int> Mask,
9920 SelectionDAG &DAG) {
9921 // Shuffle the input elements into the desired positions in V1 and V2 and
9922 // blend them together.
9923 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9924 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9925 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9926 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9927 if (Mask[i] >= 0 && Mask[i] < Size) {
9928 V1Mask[i] = Mask[i];
9929 BlendMask[i] = i;
9930 } else if (Mask[i] >= Size) {
9931 V2Mask[i] = Mask[i] - Size;
9932 BlendMask[i] = i + Size;
9933 }
9935 // Try to lower with the simpler initial blend strategy unless one of the
9936 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9937 // shuffle may be able to fold with a load or other benefit. However, when
9938 // we'll have to do 2x as many shuffles in order to achieve this, blending
9939 // first is a better strategy.
9940 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9941 if (SDValue BlendPerm =
9942 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9943 return BlendPerm;
9945 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9946 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9947 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9948 }
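// Worked example (illustrative): for the v4i32 mask <5,2,7,0> the
// decomposition above produces V1Mask <-1,2,-1,0>, V2Mask <1,-1,3,-1> and
// BlendMask <4,1,6,3>: each input is first shuffled into place and the two
// results are then combined with a cheap blend.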
9950 /// Try to lower a vector shuffle as a rotation.
9952 /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9953 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9954 ArrayRef<int> Mask) {
9955 int NumElts = Mask.size();
9957 // We need to detect various ways of spelling a rotation:
9958 // [11, 12, 13, 14, 15, 0, 1, 2]
9959 // [-1, 12, 13, 14, -1, -1, 1, -1]
9960 // [-1, -1, -1, -1, -1, -1, 1, 2]
9961 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9962 // [-1, 4, 5, 6, -1, -1, 9, -1]
9963 // [-1, 4, 5, 6, -1, -1, -1, -1]
9964 int Rotation = 0;
9965 SDValue Lo, Hi;
9966 for (int i = 0; i < NumElts; ++i) {
9967 int M = Mask[i];
9968 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9969 "Unexpected mask index.");
9970 if (M < 0)
9971 continue;
9973 // Determine where a rotated vector would have started.
9974 int StartIdx = i - (M % NumElts);
9976 // The identity rotation isn't interesting, stop.
9977 if (StartIdx == 0)
9978 return -1;
9979 // If we found the tail of a vector the rotation must be the missing
9980 // front. If we found the head of a vector, it must be how much of the
9981 // head.
9982 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9984 if (Rotation == 0)
9985 Rotation = CandidateRotation;
9986 else if (Rotation != CandidateRotation)
9987 // The rotations don't match, so we can't match this mask.
9988 return -1;
9990 // Compute which value this mask is pointing at.
9991 SDValue MaskV = M < NumElts ? V1 : V2;
9993 // Compute which of the two target values this index should be assigned
9994 // to. This reflects whether the high elements are remaining or the low
9995 // elements are remaining.
9996 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9998 // Either set up this value if we've not encountered it before, or check
9999 // that it remains consistent.
10000 if (!TargetV)
10001 TargetV = MaskV;
10002 else if (TargetV != MaskV)
10003 // This may be a rotation, but it pulls from the inputs in some
10004 // unsupported interleaving.
10005 return -1;
10006 }
10008 // Check that we successfully analyzed the mask, and normalize the results.
10009 assert(Rotation != 0 && "Failed to locate a viable rotation!");
10010 assert((Lo || Hi) && "Failed to find a rotated input vector!");
10012 if (!Lo)
10013 Lo = Hi;
10014 else if (!Hi)
10015 Hi = Lo;
10017 V1 = Lo;
10018 V2 = Hi;
10020 return Rotation;
10021 }
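// Worked example (illustrative): the v8i16 mask [11,12,13,14,15,0,1,2] from
// the comment above matches with Lo = V1, Hi = V2 and Rotation = 3: elements
// 0-4 are the tail of V2 (start index -3) and elements 5-7 are the head of
// V1 (start index 5, candidate rotation 8 - 5 = 3).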
10022 /// Try to lower a vector shuffle as a byte rotation.
10024 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
10025 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
10026 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
10027 /// try to generically lower a vector shuffle through such a pattern. It
10028 /// does not check for the profitability of lowering either as PALIGNR or
10029 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
10030 /// This matches shuffle vectors that look like:
10032 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
10034 /// Essentially it concatenates V1 and V2, shifts right by some number of
10035 /// elements, and takes the low elements as the result. Note that while this is
10036 /// specified as a *right shift* because x86 is little-endian, it is a *left
10037 /// rotate* of the vector lanes.
10038 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
10039 ArrayRef<int> Mask) {
10040 // Don't accept any shuffles with zero elements.
10041 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
10042 return -1;
10044 // PALIGNR works on 128-bit lanes.
10045 SmallVector<int, 16> RepeatedMask;
10046 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
10047 return -1;
10049 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
10050 if (Rotation <= 0)
10051 return -1;
10053 // PALIGNR rotates bytes, so we need to scale the
10054 // rotation based on how many bytes are in the vector lane.
10055 int NumElts = RepeatedMask.size();
10056 int Scale = 16 / NumElts;
10057 return Rotation * Scale;
10060 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
10061 SDValue V1, SDValue V2,
10062 ArrayRef<int> Mask,
10063 const X86Subtarget &Subtarget,
10064 SelectionDAG &DAG) {
10065 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
10067 SDValue Lo = V1, Hi = V2;
10068 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
10069 if (ByteRotation <= 0)
10070 return SDValue();
10072 // Cast the inputs to i8 vector of correct length to match PALIGNR or
10073 // PSLLDQ/PSRLDQ.
10074 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10075 Lo = DAG.getBitcast(ByteVT, Lo);
10076 Hi = DAG.getBitcast(ByteVT, Hi);
10078 // SSSE3 targets can use the palignr instruction.
10079 if (Subtarget.hasSSSE3()) {
10080 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
10081 "512-bit PALIGNR requires BWI instructions");
10082 return DAG.getBitcast(
10083 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
10084 DAG.getConstant(ByteRotation, DL, MVT::i8)));
10085 }
10087 assert(VT.is128BitVector() &&
10088 "Rotate-based lowering only supports 128-bit lowering!");
10089 assert(Mask.size() <= 16 &&
10090 "Can shuffle at most 16 bytes in a 128-bit vector!");
10091 assert(ByteVT == MVT::v16i8 &&
10092 "SSE2 rotate lowering only needed for v16i8!");
10094 // Default SSE2 implementation
10095 int LoByteShift = 16 - ByteRotation;
10096 int HiByteShift = ByteRotation;
10098 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
10099 DAG.getConstant(LoByteShift, DL, MVT::i8));
10100 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
10101 DAG.getConstant(HiByteShift, DL, MVT::i8));
10102 return DAG.getBitcast(VT,
10103 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
10104 }
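// Worked example (illustrative): the v8i16 mask [11,12,13,14,15,0,1,2] is a
// rotate by 3 elements = 6 bytes, so with SSSE3 it lowers to a single
// PALIGNR $6; without SSSE3 the fallback above emits PSLLDQ $10 on Lo,
// PSRLDQ $6 on Hi and ORs the two v16i8 halves together.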
10106 /// Try to lower a vector shuffle as a dword/qword rotation.
10108 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
10109 /// rotation of the concatenation of two vectors; this routine will
10110 /// try to generically lower a vector shuffle through such a pattern.
10112 /// Essentially it concatenates V1 and V2, shifts right by some number of
10113 /// elements, and takes the low elements as the result. Note that while this is
10114 /// specified as a *right shift* because x86 is little-endian, it is a *left
10115 /// rotate* of the vector lanes.
10116 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
10117 SDValue V1, SDValue V2,
10118 ArrayRef<int> Mask,
10119 const X86Subtarget &Subtarget,
10120 SelectionDAG &DAG) {
10121 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
10122 "Only 32-bit and 64-bit elements are supported!");
10124 // 128/256-bit vectors are only supported with VLX.
10125 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
10126 && "VLX required for 128/256-bit vectors");
10128 SDValue Lo = V1, Hi = V2;
10129 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
10130 if (Rotation <= 0)
10131 return SDValue();
10133 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
10134 DAG.getConstant(Rotation, DL, MVT::i8));
10135 }
10137 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
10139 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
10140 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
10141 /// matches elements from one of the input vectors shuffled to the left or
10142 /// right with zeroable elements 'shifted in'. It handles both the strictly
10143 /// bit-wise element shifts and the byte shift across an entire 128-bit double
10144 /// quad word lane.
10146 /// PSLL : (little-endian) left bit shift.
10147 /// [ zz, 0, zz, 2 ]
10148 /// [ -1, 4, zz, -1 ]
10149 /// PSRL : (little-endian) right bit shift.
10150 /// [ 1, zz, 3, zz]
10151 /// [ -1, -1, 7, zz]
10152 /// PSLLDQ : (little-endian) left byte shift
10153 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
10154 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
10155 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
10156 /// PSRLDQ : (little-endian) right byte shift
10157 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
10158 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
10159 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
10160 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
10161 unsigned ScalarSizeInBits,
10162 ArrayRef<int> Mask, int MaskOffset,
10163 const APInt &Zeroable,
10164 const X86Subtarget &Subtarget) {
10165 int Size = Mask.size();
10166 unsigned SizeInBits = Size * ScalarSizeInBits;
10168 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
10169 for (int i = 0; i < Size; i += Scale)
10170 for (int j = 0; j < Shift; ++j)
10171 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
10172 return false;
10174 return true;
10175 };
10177 auto MatchShift = [&](int Shift, int Scale, bool Left) {
10178 for (int i = 0; i != Size; i += Scale) {
10179 unsigned Pos = Left ? i + Shift : i;
10180 unsigned Low = Left ? i : i + Shift;
10181 unsigned Len = Scale - Shift;
10182 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
10183 return -1;
10184 }
10186 int ShiftEltBits = ScalarSizeInBits * Scale;
10187 bool ByteShift = ShiftEltBits > 64;
10188 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
10189 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
10190 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
10192 // Normalize the scale for byte shifts to still produce an i64 element
10193 // type.
10194 Scale = ByteShift ? Scale / 2 : Scale;
10196 // We need to round trip through the appropriate type for the shift.
10197 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
10198 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
10199 : MVT::getVectorVT(ShiftSVT, Size / Scale);
10200 return (int)ShiftAmt;
10201 };
10203 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
10204 // keep doubling the size of the integer elements up to that. We can
10205 // then shift the elements of the integer vector by whole multiples of
10206 // their width within the elements of the larger integer vector. Test each
10207 // multiple to see if we can find a match with the moved element indices
10208 // and that the shifted in elements are all zeroable.
10209 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
10210 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
10211 for (int Shift = 1; Shift != Scale; ++Shift)
10212 for (bool Left : {true, false})
10213 if (CheckZeros(Shift, Scale, Left)) {
10214 int ShiftAmt = MatchShift(Shift, Scale, Left);
10215 if (0 < ShiftAmt)
10216 return ShiftAmt;
10217 }
10219 // no match
10220 return -1;
10221 }
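// Worked example (illustrative): the v4i32 mask [zz,0,zz,2] from the comment
// above matches with Scale = 2, Shift = 1, Left = true: viewed as v2i64 each
// 64-bit element is shifted left by 32 bits, so the shuffle becomes a single
// VSHLI (PSLLQ $32) with the shifted-in bits known zero.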
10223 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
10224 SDValue V2, ArrayRef<int> Mask,
10225 const APInt &Zeroable,
10226 const X86Subtarget &Subtarget,
10227 SelectionDAG &DAG) {
10228 int Size = Mask.size();
10229 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10231 MVT ShiftVT;
10232 SDValue V = V1;
10233 unsigned Opcode;
10235 // Try to match shuffle against V1 shift.
10236 int ShiftAmt = matchVectorShuffleAsShift(
10237 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
10239 // If V1 failed, try to match shuffle against V2 shift.
10240 if (ShiftAmt < 0) {
10241 ShiftAmt =
10242 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
10243 Mask, Size, Zeroable, Subtarget);
10244 V = V2;
10245 }
10247 if (ShiftAmt < 0)
10248 return SDValue();
10250 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
10251 "Illegal integer vector type");
10252 V = DAG.getBitcast(ShiftVT, V);
10253 V = DAG.getNode(Opcode, DL, ShiftVT, V,
10254 DAG.getConstant(ShiftAmt, DL, MVT::i8));
10255 return DAG.getBitcast(VT, V);
10256 }
10258 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
10259 // Remainder of lower half result is zero and upper half is all undef.
10260 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
10261 ArrayRef<int> Mask, uint64_t &BitLen,
10262 uint64_t &BitIdx, const APInt &Zeroable) {
10263 int Size = Mask.size();
10264 int HalfSize = Size / 2;
10265 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10266 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
10268 // Upper half must be undefined.
10269 if (!isUndefInRange(Mask, HalfSize, HalfSize))
10270 return false;
10272 // Determine the extraction length from the part of the
10273 // lower half that isn't zeroable.
10274 int Len = HalfSize;
10275 for (; Len > 0; --Len)
10276 if (!Zeroable[Len - 1])
10277 break;
10278 assert(Len > 0 && "Zeroable shuffle mask");
10280 // Attempt to match first Len sequential elements from the lower half.
10281 SDValue Src;
10282 int Idx = -1;
10283 for (int i = 0; i != Len; ++i) {
10285 if (M == SM_SentinelUndef)
10286 continue;
10287 SDValue &V = (M < Size ? V1 : V2);
10288 M = M % Size;
10290 // The extracted elements must start at a valid index and all mask
10291 // elements must be in the lower half.
10292 if (i > M || M >= HalfSize)
10293 return false;
10295 if (Idx < 0 || (Src == V && Idx == (M - i))) {
10296 Src = V;
10297 Idx = M - i;
10298 continue;
10299 }
10300 return false;
10301 }
10303 if (!Src || Idx < 0)
10304 return false;
10306 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
10307 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10308 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10309 return true;
10310 }
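// Worked example (illustrative): for a v8i16 mask [2,3,zz,zz,u,u,u,u] (upper
// half undef, lanes 2-3 zeroable) the match above finds Len = 2 and Idx = 2,
// giving BitLen = 32 and BitIdx = 32: EXTRQ pulls 32 bits starting at bit 32
// of V1 into the low half and zeroes the remainder of it.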
10313 // INSERTQ: Extract lowest Len elements from lower half of second source and
10314 // insert over first source, starting at Idx.
10315 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
10316 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
10317 ArrayRef<int> Mask, uint64_t &BitLen,
10318 uint64_t &BitIdx) {
10319 int Size = Mask.size();
10320 int HalfSize = Size / 2;
10321 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10323 // Upper half must be undefined.
10324 if (!isUndefInRange(Mask, HalfSize, HalfSize))
10325 return false;
10327 for (int Idx = 0; Idx != HalfSize; ++Idx) {
10328 SDValue Base;
10330 // Attempt to match first source from mask before insertion point.
10331 if (isUndefInRange(Mask, 0, Idx)) {
10332 /* EMPTY */
10333 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
10334 Base = V1;
10335 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
10336 Base = V2;
10337 } else {
10338 continue;
10339 }
10341 // Extend the extraction length looking to match both the insertion of
10342 // the second source and the remaining elements of the first.
10343 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
10344 SDValue Insert;
10345 int Len = Hi - Idx;
10347 // Match insertion.
10348 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
10349 Insert = V1;
10350 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
10351 Insert = V2;
10352 } else {
10353 continue;
10354 }
10356 // Match the remaining elements of the lower half.
10357 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
10358 /* EMPTY */
10359 } else if ((!Base || (Base == V1)) &&
10360 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
10361 Base = V1;
10362 } else if ((!Base || (Base == V2)) &&
10363 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
10364 Size + Hi)) {
10365 Base = V2;
10366 } else {
10367 continue;
10368 }
10370 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10371 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10372 V1 = Base;
10373 V2 = Insert;
10374 return true;
10375 }
10376 }
10378 return false;
10379 }
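// Worked example (illustrative): a v8i16 mask [0,8,9,3,u,u,u,u] matches with
// Base = V1, Insert = V2, Len = 2 and Idx = 1 (BitLen = 32, BitIdx = 16):
// INSERTQ copies the low 32 bits of V2 over V1 starting at bit 16.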
10381 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
10382 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
10383 SDValue V2, ArrayRef<int> Mask,
10384 const APInt &Zeroable,
10385 SelectionDAG &DAG) {
10386 uint64_t BitLen, BitIdx;
10387 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
10388 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
10389 DAG.getConstant(BitLen, DL, MVT::i8),
10390 DAG.getConstant(BitIdx, DL, MVT::i8));
10392 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
10393 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
10394 V2 ? V2 : DAG.getUNDEF(VT),
10395 DAG.getConstant(BitLen, DL, MVT::i8),
10396 DAG.getConstant(BitIdx, DL, MVT::i8));
10398 return SDValue();
10399 }
10401 /// Lower a vector shuffle as a zero or any extension.
10403 /// Given a specific number of elements, element bit width, and extension
10404 /// stride, produce either a zero or any extension based on the available
10405 /// features of the subtarget. The extended elements are consecutive and
10406 /// begin at an element index in the input, which may be offset; to
10407 /// avoid excess shuffling the offset must either be in the bottom lane
10408 /// or at the start of a higher lane. All extended elements must come from
10409 /// a single input.
10410 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10411 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
10412 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10413 assert(Scale > 1 && "Need a scale to extend.");
10414 int EltBits = VT.getScalarSizeInBits();
10415 int NumElements = VT.getVectorNumElements();
10416 int NumEltsPerLane = 128 / EltBits;
10417 int OffsetLane = Offset / NumEltsPerLane;
10418 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
10419 "Only 8, 16, and 32 bit elements can be extended.");
10420 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
10421 assert(0 <= Offset && "Extension offset must be positive.");
10422 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
10423 "Extension offset must be in the first lane or start an upper lane.");
10425 // Check that an index is in same lane as the base offset.
10426 auto SafeOffset = [&](int Idx) {
10427 return OffsetLane == (Idx / NumEltsPerLane);
10430 // Shift along an input so that the offset base moves to the first element.
10431 auto ShuffleOffset = [&](SDValue V) {
10432 if (!Offset)
10433 return V;
10435 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10436 for (int i = 0; i * Scale < NumElements; ++i) {
10437 int SrcIdx = i + Offset;
10438 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
10439 }
10440 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
10441 };
10443 // Found a valid zext mask! Try various lowering strategies based on the
10444 // input type and available ISA extensions.
10445 if (Subtarget.hasSSE41()) {
10446 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
10447 // PUNPCK will catch this in a later shuffle match.
10448 if (Offset && Scale == 2 && VT.is128BitVector())
10449 return SDValue();
10450 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
10451 NumElements / Scale);
10452 InputV = ShuffleOffset(InputV);
10453 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
10454 return DAG.getBitcast(VT, InputV);
10455 }
10457 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
10459 // For any extends we can cheat for larger element sizes and use shuffle
10460 // instructions that can fold with a load and/or copy.
10461 if (AnyExt && EltBits == 32) {
10462 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
10463 -1};
10464 return DAG.getBitcast(
10465 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10466 DAG.getBitcast(MVT::v4i32, InputV),
10467 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10469 if (AnyExt && EltBits == 16 && Scale > 2) {
10470 int PSHUFDMask[4] = {Offset / 2, -1,
10471 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
10472 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10473 DAG.getBitcast(MVT::v4i32, InputV),
10474 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10475 int PSHUFWMask[4] = {1, -1, -1, -1};
10476 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
10477 return DAG.getBitcast(
10478 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
10479 DAG.getBitcast(MVT::v8i16, InputV),
10480 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
10481 }
10483 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
10484 // to 64-bit integers.
10485 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
10486 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
10487 assert(VT.is128BitVector() && "Unexpected vector width!");
10489 int LoIdx = Offset * EltBits;
10490 SDValue Lo = DAG.getBitcast(
10491 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10492 DAG.getConstant(EltBits, DL, MVT::i8),
10493 DAG.getConstant(LoIdx, DL, MVT::i8)));
10495 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
10496 !SafeOffset(Offset + 1))
10497 return DAG.getBitcast(VT, Lo);
10499 int HiIdx = (Offset + 1) * EltBits;
10500 SDValue Hi = DAG.getBitcast(
10501 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10502 DAG.getConstant(EltBits, DL, MVT::i8),
10503 DAG.getConstant(HiIdx, DL, MVT::i8)));
10504 return DAG.getBitcast(VT,
10505 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
10506 }
10508 // If this would require more than 2 unpack instructions to expand, use
10509 // pshufb when available. We can only use more than 2 unpack instructions
10510 // when zero extending i8 elements which also makes it easier to use pshufb.
10511 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
10512 assert(NumElements == 16 && "Unexpected byte vector width!");
10513 SDValue PSHUFBMask[16];
10514 for (int i = 0; i < 16; ++i) {
10515 int Idx = Offset + (i / Scale);
10516 PSHUFBMask[i] = DAG.getConstant(
10517 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
10518 }
10519 InputV = DAG.getBitcast(MVT::v16i8, InputV);
10520 return DAG.getBitcast(
10521 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
10522 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
10523 }
10525 // If we are extending from an offset, ensure we start on a boundary that
10526 // we can unpack from.
10527 int AlignToUnpack = Offset % (NumElements / Scale);
10528 if (AlignToUnpack) {
10529 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10530 for (int i = AlignToUnpack; i < NumElements; ++i)
10531 ShMask[i - AlignToUnpack] = i;
10532 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
10533 Offset -= AlignToUnpack;
10534 }
10536 // Otherwise emit a sequence of unpacks.
10537 do {
10538 unsigned UnpackLoHi = X86ISD::UNPCKL;
10539 if (Offset >= (NumElements / 2)) {
10540 UnpackLoHi = X86ISD::UNPCKH;
10541 Offset -= (NumElements / 2);
10542 }
10544 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
10545 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
10546 : getZeroVector(InputVT, Subtarget, DAG, DL);
10547 InputV = DAG.getBitcast(InputVT, InputV);
10548 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
10549 Scale /= 2;
10550 EltBits *= 2;
10551 NumElements /= 2;
10552 } while (Scale > 1);
10553 return DAG.getBitcast(VT, InputV);
10554 }
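// Worked example (illustrative): zero-extending the low four i8 elements of
// a v16i8 to i32 without SSE4.1 takes two rounds of the unpack loop above:
// punpcklbw with a zero vector (i8 -> i16), then punpcklwd with zero
// (i16 -> i32); with SSE4.1 the same shuffle is a single PMOVZXBD.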
10556 /// Try to lower a vector shuffle as a zero extension on any microarch.
10558 /// This routine will try to do everything in its power to cleverly lower
10559 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
10560 /// check for the profitability of this lowering, it tries to aggressively
10561 /// match this pattern. It will use all of the micro-architectural details it
10562 /// can to emit an efficient lowering. It handles both blends with all-zero
10563 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
10564 /// masking out later).
10566 /// The reason we have dedicated lowering for zext-style shuffles is that they
10567 /// are both incredibly common and often quite performance sensitive.
10568 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
10569 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10570 const APInt &Zeroable, const X86Subtarget &Subtarget,
10571 SelectionDAG &DAG) {
10572 int Bits = VT.getSizeInBits();
10573 int NumLanes = Bits / 128;
10574 int NumElements = VT.getVectorNumElements();
10575 int NumEltsPerLane = NumElements / NumLanes;
10576 assert(VT.getScalarSizeInBits() <= 32 &&
10577 "Exceeds 32-bit integer zero extension limit");
10578 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
10580 // Define a helper function to check a particular ext-scale and lower to it if
10581 // valid.
10582 auto Lower = [&](int Scale) -> SDValue {
10583 SDValue InputV;
10584 bool AnyExt = true;
10585 int Offset = 0;
10586 int Matches = 0;
10587 for (int i = 0; i < NumElements; ++i) {
10588 int M = Mask[i];
10589 if (M < 0)
10590 continue; // Valid anywhere but doesn't tell us anything.
10591 if (i % Scale != 0) {
10592 // Each of the extended elements need to be zeroable.
10593 if (!Zeroable[i])
10594 return SDValue();
10596 // We no longer are in the anyext case.
10597 AnyExt = false;
10598 continue;
10599 }
10601 // Each of the base elements needs to be consecutive indices into the
10602 // same input vector.
10603 SDValue V = M < NumElements ? V1 : V2;
10604 M = M % NumElements;
10605 if (!InputV) {
10606 InputV = V;
10607 Offset = M - (i / Scale);
10608 } else if (InputV != V)
10609 return SDValue(); // Flip-flopping inputs.
10611 // Offset must start in the lowest 128-bit lane or at the start of an
10612 // upper lane.
10613 // FIXME: Is it ever worth allowing a negative base offset?
10614 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
10615 (Offset % NumEltsPerLane) == 0))
10616 return SDValue();
10618 // If we are offsetting, all referenced entries must come from the same
10619 // lane.
10620 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
10621 return SDValue();
10623 if ((M % NumElements) != (Offset + (i / Scale)))
10624 return SDValue(); // Non-consecutive strided elements.
10625 Matches++;
10626 }
10628 // If we fail to find an input, we have a zero-shuffle which should always
10629 // have already been handled.
10630 // FIXME: Maybe handle this here in case during blending we end up with one?
10631 if (!InputV)
10632 return SDValue();
10634 // If we are offsetting, don't extend if we only match a single input, we
10635 // can always do better by using a basic PSHUF or PUNPCK.
10636 if (Offset != 0 && Matches < 2)
10637 return SDValue();
10639 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10640 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
10641 };
10643 // The widest scale possible for extending is to a 64-bit integer.
10644 assert(Bits % 64 == 0 &&
10645 "The number of bits in a vector must be divisible by 64 on x86!");
10646 int NumExtElements = Bits / 64;
10648 // Each iteration, try extending the elements half as much, but into twice as
10649 // many elements.
10650 for (; NumExtElements < NumElements; NumExtElements *= 2) {
10651 assert(NumElements % NumExtElements == 0 &&
10652 "The input vector size must be divisible by the extended size.");
10653 if (SDValue V = Lower(NumElements / NumExtElements))
10654 return V;
10657 // General extends failed, but 128-bit vectors may be able to use MOVQ.
10658 if (Bits != 128)
10659 return SDValue();
10661 // Returns one of the source operands if the shuffle can be reduced to a
10662 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
10663 auto CanZExtLowHalf = [&]() {
10664 for (int i = NumElements / 2; i != NumElements; ++i)
10665 if (!Zeroable[i])
10666 return SDValue();
10667 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
10668 return V1;
10669 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
10670 return V2;
10671 return SDValue();
10672 };
10674 if (SDValue V = CanZExtLowHalf()) {
10675 V = DAG.getBitcast(MVT::v2i64, V);
10676 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
10677 return DAG.getBitcast(VT, V);
10678 }
10680 // No viable ext lowering found.
10681 return SDValue();
10682 }
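// Worked example (illustrative): a v4i32 mask <0,1,zz,zz> whose upper two
// lanes are zeroable is caught by CanZExtLowHalf above and becomes
// VZEXT_MOVL on the v2i64 view, i.e. a single MOVQ that copies the low
// 64 bits and zeroes the high 64 bits.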
10684 /// Try to get a scalar value for a specific element of a vector.
10686 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
10687 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
10688 SelectionDAG &DAG) {
10689 MVT VT = V.getSimpleValueType();
10690 MVT EltVT = VT.getVectorElementType();
10691 V = peekThroughBitcasts(V);
10693 // If the bitcasts shift the element size, we can't extract an equivalent
10694 // element from it.
10695 MVT NewVT = V.getSimpleValueType();
10696 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
10697 return SDValue();
10699 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10700 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
10701 // Ensure the scalar operand is the same size as the destination.
10702 // FIXME: Add support for scalar truncation where possible.
10703 SDValue S = V.getOperand(Idx);
10704 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
10705 return DAG.getBitcast(EltVT, S);
10706 }
10708 return SDValue();
10709 }
10711 /// Helper to test for a load that can be folded with x86 shuffles.
10713 /// This is particularly important because the set of instructions varies
10714 /// significantly based on whether the operand is a load or not.
10715 static bool isShuffleFoldableLoad(SDValue V) {
10716 V = peekThroughBitcasts(V);
10717 return ISD::isNON_EXTLoad(V.getNode());
10718 }
10720 /// Try to lower insertion of a single element into a zero vector.
10722 /// This is a common pattern for which we have especially efficient lowering
10723 /// patterns across all subtarget feature sets.
10724 static SDValue lowerVectorShuffleAsElementInsertion(
10725 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10726 const APInt &Zeroable, const X86Subtarget &Subtarget,
10727 SelectionDAG &DAG) {
10728 MVT ExtVT = VT;
10729 MVT EltVT = VT.getVectorElementType();
10731 int V2Index =
10732 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
10733 Mask.begin();
10734 bool IsV1Zeroable = true;
10735 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10736 if (i != V2Index && !Zeroable[i]) {
10737 IsV1Zeroable = false;
10738 break;
10739 }
10741 // Check for a single input from a SCALAR_TO_VECTOR node.
10742 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
10743 // all the smarts here sunk into that routine. However, the current
10744 // lowering of BUILD_VECTOR makes that nearly impossible until the old
10745 // vector shuffle lowering is dead.
10746 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
10747 DAG);
10748 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
10749 // We need to zext the scalar if it is smaller than an i32.
10750 V2S = DAG.getBitcast(EltVT, V2S);
10751 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
10752 // Using zext to expand a narrow element won't work for non-zero
10753 // elements.
10754 if (!IsV1Zeroable)
10755 return SDValue();
10757 // Zero-extend directly to i32.
10758 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
10759 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
10760 }
10761 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
10762 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
10763 EltVT == MVT::i16) {
10764 // Either not inserting from the low element of the input or the input
10765 // element size is too small to use VZEXT_MOVL to clear the high bits.
10766 return SDValue();
10767 }
10769 if (!IsV1Zeroable) {
10770 // If V1 can't be treated as a zero vector we have fewer options to lower
10771 // this. We can't support integer vectors or non-zero targets cheaply, and
10772 // the V1 elements can't be permuted in any way.
10773 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
10774 if (!VT.isFloatingPoint() || V2Index != 0)
10775 return SDValue();
10776 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
10777 V1Mask[V2Index] = -1;
10778 if (!isNoopShuffleMask(V1Mask))
10779 return SDValue();
10780 if (!VT.is128BitVector())
10781 return SDValue();
10783 // Otherwise, use MOVSD or MOVSS.
10784 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
10785 "Only two types of floating point element types to handle!");
10786 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
10787 VT, V1, V2);
10788 }
10790 // This lowering only works for the low element with floating point vectors.
10791 if (VT.isFloatingPoint() && V2Index != 0)
10792 return SDValue();
10794 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
10795 if (ExtVT != VT)
10796 V2 = DAG.getBitcast(VT, V2);
10798 if (V2Index != 0) {
10799 // If we have 4 or fewer lanes we can cheaply shuffle the element into
10800 // the desired position. Otherwise it is more efficient to do a vector
10801 // shift left. We know that we can do a vector shift left because all
10802 // the inputs are zero.
10803 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
10804 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
10805 V2Shuffle[V2Index] = 0;
10806 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
10807 } else {
10808 V2 = DAG.getBitcast(MVT::v16i8, V2);
10809 V2 = DAG.getNode(
10810 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
10811 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
10812 V2 = DAG.getBitcast(VT, V2);
10813 }
10814 }
10815 return V2;
10816 }
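// Worked example (illustrative): the v4f32 mask <4,1,2,3> inserts V2[0] into
// lane 0 of an otherwise unchanged V1; V1 is not zeroable, so the code above
// lowers it with MOVSS. With a fully zeroable V1 the same insertion is
// instead lowered as VZEXT_MOVL, clearing the remaining lanes to zero.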
10818 /// Try to lower a broadcast of a single (truncated) integer element
10819 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10821 /// This assumes we have AVX2.
10822 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
10823 SDValue V0, int BroadcastIdx,
10824 const X86Subtarget &Subtarget,
10825 SelectionDAG &DAG) {
10826 assert(Subtarget.hasAVX2() &&
10827 "We can only lower integer broadcasts with AVX2!");
10829 EVT EltVT = VT.getVectorElementType();
10830 EVT V0VT = V0.getValueType();
10832 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
10833 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
10835 EVT V0EltVT = V0VT.getVectorElementType();
10836 if (!V0EltVT.isInteger())
10837 return SDValue();
10839 const unsigned EltSize = EltVT.getSizeInBits();
10840 const unsigned V0EltSize = V0EltVT.getSizeInBits();
10842 // This is only a truncation if the original element type is larger.
10843 if (V0EltSize <= EltSize)
10844 return SDValue();
10846 assert(((V0EltSize % EltSize) == 0) &&
10847 "Scalar type sizes must all be powers of 2 on x86!");
10849 const unsigned V0Opc = V0.getOpcode();
10850 const unsigned Scale = V0EltSize / EltSize;
10851 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
10853 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
10854 V0Opc != ISD::BUILD_VECTOR)
10855 return SDValue();
10857 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
10859 // If we're extracting non-least-significant bits, shift so we can truncate.
10860 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
10861 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
10862 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
10863 if (const int OffsetIdx = BroadcastIdx % Scale)
10864 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
10865 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
10867 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
10868 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
10869 }
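// Worked example (illustrative): broadcasting v8i16 element 1 from a
// scalar_to_vector of an i32 gives Scale = 2, V0BroadcastIdx = 0 and
// OffsetIdx = 1, so the scalar is first shifted right by 16 bits and then
// truncated to i16 and broadcast; the srl/trunc can often fold into a
// preceding load.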
10871 /// Try to lower broadcast of a single element.
10873 /// For convenience, this code also bundles all of the subtarget feature set
10874 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10875 /// a convenient way to factor it out.
10876 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10877 SDValue V1, SDValue V2,
10878 ArrayRef<int> Mask,
10879 const X86Subtarget &Subtarget,
10880 SelectionDAG &DAG) {
10881 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10882 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10883 (Subtarget.hasAVX2() && VT.isInteger())))
10884 return SDValue();
10886 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10887 // we can only broadcast from a register with AVX2.
10888 unsigned NumElts = Mask.size();
10889 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10890 ? X86ISD::MOVDDUP
10891 : X86ISD::VBROADCAST;
10892 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10894 // Check that the mask is a broadcast.
10895 int BroadcastIdx = -1;
10896 for (int i = 0; i != (int)NumElts; ++i) {
10897 SmallVector<int, 8> BroadcastMask(NumElts, i);
10898 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10899 BroadcastIdx = i;
10900 break;
10901 }
10902 }
10904 if (BroadcastIdx < 0)
10905 return SDValue();
10906 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10907 "a sorted mask where the broadcast "
10908 "comes from V1.");
10910 // Go up the chain of (vector) values to find a scalar load that we can
10911 // combine with the broadcast.
10912 SDValue V = V1;
10913 for (;;) {
10914 switch (V.getOpcode()) {
10915 case ISD::BITCAST: {
10916 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
10917 SDValue VSrc = V.getOperand(0);
10918 unsigned NumEltBits = V.getScalarValueSizeInBits();
10919 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
10920 if ((NumEltBits % NumSrcBits) == 0)
10921 BroadcastIdx *= (NumEltBits / NumSrcBits);
10922 else if ((NumSrcBits % NumEltBits) == 0 &&
10923 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
10924 BroadcastIdx /= (NumSrcBits / NumEltBits);
10925 else
10926 break;
10927 V = VSrc;
10928 continue;
10929 }
10930 case ISD::CONCAT_VECTORS: {
10931 int OperandSize = Mask.size() / V.getNumOperands();
10932 V = V.getOperand(BroadcastIdx / OperandSize);
10933 BroadcastIdx %= OperandSize;
10934 continue;
10935 }
10936 case ISD::INSERT_SUBVECTOR: {
10937 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10938 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10939 if (!ConstantIdx)
10940 break;
10942 int BeginIdx = (int)ConstantIdx->getZExtValue();
10943 int EndIdx =
10944 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10945 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10946 BroadcastIdx -= BeginIdx;
10947 V = VInner;
10948 } else {
10949 V = VOuter;
10950 }
10951 continue;
10952 }
10953 }
10954 break;
10955 }
10957 // Ensure the source vector and BroadcastIdx are for a suitable type.
10958 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
10959 unsigned NumEltBits = VT.getScalarSizeInBits();
10960 unsigned NumSrcBits = V.getScalarValueSizeInBits();
10961 if ((NumSrcBits % NumEltBits) == 0)
10962 BroadcastIdx *= (NumSrcBits / NumEltBits);
10963 else if ((NumEltBits % NumSrcBits) == 0 &&
10964 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
10965 BroadcastIdx /= (NumEltBits / NumSrcBits);
10966 else
10967 return SDValue();
10969 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
10970 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
10971 V = DAG.getBitcast(SrcVT, V);
10972 }
10974 // Check if this is a broadcast of a scalar. We special case lowering
10975 // for scalars so that we can more effectively fold with loads.
10976 // First, look through bitcast: if the original value has a larger element
10977 // type than the shuffle, the broadcast element is in essence truncated.
10978 // Make that explicit to ease folding.
10979 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10980 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10981 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10982 return TruncBroadcast;
10984 MVT BroadcastVT = VT;
10986 // Peek through any bitcast (only useful for loads).
10987 SDValue BC = peekThroughBitcasts(V);
10989 // Also check the simpler case, where we can directly reuse the scalar.
10990 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10991 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10992 V = V.getOperand(BroadcastIdx);
10993 BroadcastIdx = 0;
10994 // If we can't broadcast from a register, check that the input is a load.
10995 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10996 return SDValue();
10997 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10998 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10999 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
11000 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
11001 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
11002 ? X86ISD::MOVDDUP
11003 : Opcode;
11004 }
11006 // If we are broadcasting a load that is only used by the shuffle
11007 // then we can reduce the vector load to the broadcasted scalar load.
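// For example, a broadcast of element 2 of a v4f32 load from Base becomes a
// 4-byte scalar load from Base + 8 that feeds the broadcast directly.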
11008 LoadSDNode *Ld = cast<LoadSDNode>(BC);
11009 SDValue BaseAddr = Ld->getOperand(1);
11010 EVT SVT = BroadcastVT.getScalarType();
11011 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
11012 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
11013 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
11014 DAG.getMachineFunction().getMachineMemOperand(
11015 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
11016 DAG.makeEquivalentMemoryOrdering(Ld, V);
11017 } else if (!BroadcastFromReg) {
11018 // We can't broadcast from a vector register.
11019 return SDValue();
11020 } else if (BroadcastIdx != 0) {
11021 // We can only broadcast from the zero-element of a vector register,
11022 // but it can be advantageous to broadcast from the zero-element of a
11023 // subvector.
11024 if (!VT.is256BitVector() && !VT.is512BitVector())
11025 return SDValue();
11027 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
11028 if (VT == MVT::v4f64 || VT == MVT::v4i64)
11029 return SDValue();
11031 // Only broadcast the zero-element of a 128-bit subvector.
11032 unsigned EltSize = VT.getScalarSizeInBits();
11033 if (((BroadcastIdx * EltSize) % 128) != 0)
11034 return SDValue();
11036 // The shuffle input might have been a bitcast we looked through; look at
11037 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
11038 // later bitcast it to BroadcastVT.
11039 assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11040 "Unexpected vector element size");
11041 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
11042 "Unexpected vector size");
11043 V = extract128BitVector(V, BroadcastIdx, DAG, DL);
11044 }
11046 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
11047 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
11048 DAG.getBitcast(MVT::f64, V));
11050 // Bitcast back to the same scalar type as BroadcastVT.
11051 MVT SrcVT = V.getSimpleValueType();
11052 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
11053 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11054 "Unexpected vector element size");
11055 if (SrcVT.isVector()) {
11056 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11057 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
11058 } else {
11059 SrcVT = BroadcastVT.getScalarType();
11060 }
11061 V = DAG.getBitcast(SrcVT, V);
11062 }
11064 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
11065 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
11066 V = DAG.getBitcast(MVT::f64, V);
11067 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
11068 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
11069 }
11071 // We only support broadcasting from 128-bit vectors to minimize the
11072 // number of patterns we need to deal with in isel. So extract down to
11073 // 128-bits, removing as many bitcasts as possible.
11074 if (SrcVT.getSizeInBits() > 128) {
11075 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
11076 128 / SrcVT.getScalarSizeInBits());
11077 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
11078 V = DAG.getBitcast(ExtVT, V);
11079 }
11081 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
11082 }
11084 // Check for whether we can use INSERTPS to perform the shuffle. We only use
11085 // INSERTPS when the V1 elements are already in the correct locations
11086 // because otherwise we can just always use two SHUFPS instructions which
11087 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
11088 // perform INSERTPS if a single V1 element is out of place and all V2
11089 // elements are zeroable.
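// The 8-bit INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4], and a zero mask for the result lanes in
// bits [3:0]; e.g. inserting V2[1] into V1[3] while zeroing lane 0 encodes
// as 0x71.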
11090 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
11091 unsigned &InsertPSMask,
11092 const APInt &Zeroable,
11093 ArrayRef<int> Mask,
11094 SelectionDAG &DAG) {
11095 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
11096 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
11097 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11099 // Attempt to match INSERTPS with one element from VA or VB being
11100 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
11101 // will be set accordingly.
11102 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
11103 ArrayRef<int> CandidateMask) {
11104 unsigned ZMask = 0;
11105 int VADstIndex = -1;
11106 int VBDstIndex = -1;
11107 bool VAUsedInPlace = false;
11109 for (int i = 0; i < 4; ++i) {
11110 // Synthesize a zero mask from the zeroable elements (includes undefs).
11111 if (Zeroable[i]) {
11112 ZMask |= 1 << i;
11113 continue;
11114 }
11116 // Flag if we use any VA inputs in place.
11117 if (i == CandidateMask[i]) {
11118 VAUsedInPlace = true;
11119 continue;
11120 }
11122 // We can only insert a single non-zeroable element.
11123 if (VADstIndex >= 0 || VBDstIndex >= 0)
11124 return false;
11126 if (CandidateMask[i] < 4) {
11127 // VA input out of place for insertion.
11128 VADstIndex = i;
11129 } else {
11130 // VB input for insertion.
11131 VBDstIndex = i;
11132 }
11133 }
11135 // Don't bother if we have no (non-zeroable) element for insertion.
11136 if (VADstIndex < 0 && VBDstIndex < 0)
11137 return false;
11139 // Determine element insertion src/dst indices. The src index is from the
11140 // start of the inserted vector, not the start of the concatenated vector.
11141 unsigned VBSrcIndex = 0;
11142 if (VADstIndex >= 0) {
11143 // If we have a VA input out of place, we use VA as the V2 element
11144 // insertion and don't use the original V2 at all.
11145 VBSrcIndex = CandidateMask[VADstIndex];
11146 VBDstIndex = VADstIndex;
11147 VB = VA;
11148 } else {
11149 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
11150 }
11152 // If no V1 inputs are used in place, then the result is created only from
11153 // the zero mask and the V2 insertion - so remove V1 dependency.
11154 if (!VAUsedInPlace)
11155 VA = DAG.getUNDEF(MVT::v4f32);
11157 // Update V1, V2 and InsertPSMask accordingly.
11158 V1 = VA;
11159 V2 = VB;
11161 // Insert the V2 element into the desired position.
11162 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
11163 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
11164 return true;
11165 };
11167 if (matchAsInsertPS(V1, V2, Mask))
11168 return true;
11170 // Commute and try again.
11171 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11172 ShuffleVectorSDNode::commuteMask(CommutedMask);
11173 if (matchAsInsertPS(V2, V1, CommutedMask))
11174 return true;
11176 return false;
11177 }
11179 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
11180 SDValue V2, ArrayRef<int> Mask,
11181 const APInt &Zeroable,
11182 SelectionDAG &DAG) {
11183 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11184 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11186 // Attempt to match the insertps pattern.
11187 unsigned InsertPSMask;
11188 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
11189 return SDValue();
11191 // Insert the V2 element into the desired position.
11192 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
11193 DAG.getConstant(InsertPSMask, DL, MVT::i8));
11194 }
11196 /// Try to lower a shuffle as a permute of the inputs followed by an
11197 /// UNPCK instruction.
11199 /// This specifically targets cases where we end up with alternating between
11200 /// the two inputs, and so can permute them into something that feeds a single
11201 /// UNPCK instruction. Note that this routine only targets integer vectors
11202 /// because for floating point vectors we have a generalized SHUFPS lowering
11203 /// strategy that handles everything that doesn't *exactly* match an unpack,
11204 /// making this clever lowering unnecessary.
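///
/// For example, the v4i32 mask <0, 4, 2, 6> alternates between the inputs;
/// permuting each input so its used elements occupy the low half turns this
/// into <0, 4, 1, 5>, which is exactly UNPCKLDQ of the permuted inputs.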
11205 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11206 SDValue V1, SDValue V2,
11207 ArrayRef<int> Mask,
11208 SelectionDAG &DAG) {
11209 assert(!VT.isFloatingPoint() &&
11210 "This routine only supports integer vectors.");
11211 assert(VT.is128BitVector() &&
11212 "This routine only works on 128-bit vectors.");
11213 assert(!V2.isUndef() &&
11214 "This routine should only be used when blending two inputs.");
11215 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11217 int Size = Mask.size();
11219 int NumLoInputs =
11220 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11221 int NumHiInputs =
11222 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11224 bool UnpackLo = NumLoInputs >= NumHiInputs;
11226 auto TryUnpack = [&](int ScalarSize, int Scale) {
11227 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11228 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11230 for (int i = 0; i < Size; ++i) {
11231 if (Mask[i] < 0)
11232 continue;
11234 // Each element of the unpack contains Scale elements from this mask.
11235 int UnpackIdx = i / Scale;
11237 // We only handle the case where V1 feeds the first slots of the unpack.
11238 // We rely on canonicalization to ensure this is the case.
11239 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11240 return SDValue();
11242 // Setup the mask for this input. The indexing is tricky as we have to
11243 // handle the unpack stride.
11244 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11245 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11246 Mask[i] % Size;
11247 }
11249 // If we will have to shuffle both inputs to use the unpack, check whether
11250 // we can just unpack first and shuffle the result. If so, skip this unpack.
11251 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11252 !isNoopShuffleMask(V2Mask))
11253 return SDValue();
11255 // Shuffle the inputs into place.
11256 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11257 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11259 // Cast the inputs to the type we will use to unpack them.
11260 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11261 V1 = DAG.getBitcast(UnpackVT, V1);
11262 V2 = DAG.getBitcast(UnpackVT, V2);
11264 // Unpack the inputs and cast the result back to the desired type.
11265 return DAG.getBitcast(
11266 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11267 UnpackVT, V1, V2));
11268 };
11270 // We try each unpack from the largest to the smallest to try and find one
11271 // that fits this mask.
11272 int OrigScalarSize = VT.getScalarSizeInBits();
11273 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11274 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11275 return Unpack;
11277 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11278 // initial unpack.
11279 if (NumLoInputs == 0 || NumHiInputs == 0) {
11280 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11281 "We have to have *some* inputs!");
11282 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11284 // FIXME: We could consider the total complexity of the permute of each
11285 // possible unpacking. Or at the least we should consider how many
11286 // half-crossings are created.
11287 // FIXME: We could consider commuting the unpacks.
11289 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11290 for (int i = 0; i < Size; ++i) {
11291 if (Mask[i] < 0)
11292 continue;
11294 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11296 PermMask[i] =
11297 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11298 }
11299 return DAG.getVectorShuffle(
11300 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
11301 DL, VT, V1, V2),
11302 DAG.getUNDEF(VT), PermMask);
11303 }
11305 return SDValue();
11306 }
11308 /// Handle lowering of 2-lane 64-bit floating point shuffles.
11310 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
11311 /// support for floating point shuffles but not integer shuffles. These
11312 /// instructions will incur a domain crossing penalty on some chips though so
11313 /// it is better to avoid lowering through this for integer vectors where
11314 /// possible.
11315 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11316 const APInt &Zeroable,
11317 SDValue V1, SDValue V2,
11318 const X86Subtarget &Subtarget,
11319 SelectionDAG &DAG) {
11320 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11321 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11322 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11324 if (V2.isUndef()) {
11325 // Check for being able to broadcast a single element.
11326 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11327 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
11328 return Broadcast;
11330 // Straight shuffle of a single input vector. Simulate this by using the
11331 // single input as both of the "inputs" to this instruction.
11332 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
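// Bit 0 of the SHUFPD immediate picks the element of the first operand and
// bit 1 the element of the second; the unary mask {1, 1} therefore encodes
// as immediate 3 and duplicates the high element.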
11334 if (Subtarget.hasAVX()) {
11335 // If we have AVX, we can use VPERMILPS which will allow folding a load
11336 // into the shuffle.
11337 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
11338 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11339 }
11341 return DAG.getNode(
11342 X86ISD::SHUFP, DL, MVT::v2f64,
11343 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11344 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11345 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11346 }
11347 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
11348 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
11349 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
11350 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
11352 // When loading a scalar and then shuffling it into a vector we can often do
11353 // the insertion cheaply.
11354 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11355 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11356 return Insertion;
11357 // Try inverting the insertion since for v2 masks it is easy to do and we
11358 // can't reliably sort the mask one way or the other.
11359 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
11360 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
11361 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11362 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11363 return Insertion;
11365 // Try to use one of the special instruction patterns to handle two common
11366 // blend patterns if a zero-blend above didn't work.
11367 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
11368 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
11369 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
11370 // We can either use a special instruction to load over the low double or
11371 // to move just the low double.
11372 return DAG.getNode(
11373 X86ISD::MOVSD, DL, MVT::v2f64, V2,
11374 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
11376 if (Subtarget.hasSSE41())
11377 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
11378 Zeroable, Subtarget, DAG))
11379 return Blend;
11381 // Use dedicated unpack instructions for masks that match their pattern.
11382 if (SDValue V =
11383 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
11384 return V;
11386 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
11387 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
11388 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11389 }
11391 /// Handle lowering of 2-lane 64-bit integer shuffles.
11393 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
11394 /// the integer unit to minimize domain crossing penalties. However, for blends
11395 /// it falls back to the floating point shuffle operation with appropriate bit
11396 /// casting.
11397 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11398 const APInt &Zeroable,
11399 SDValue V1, SDValue V2,
11400 const X86Subtarget &Subtarget,
11401 SelectionDAG &DAG) {
11402 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11403 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11404 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11406 if (V2.isUndef()) {
11407 // Check for being able to broadcast a single element.
11408 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11409 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11410 return Broadcast;
11412 // Straight shuffle of a single input vector. For everything from SSE2
11413 // onward this has a single fast instruction with no scary immediates.
11414 // We have to map the mask as it is actually a v4i32 shuffle instruction.
11415 V1 = DAG.getBitcast(MVT::v4i32, V1);
11416 int WidenedMask[4] = {
11417 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
11418 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
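// For example, the v2i64 mask <1, 0> maps to the v4i32 PSHUFD mask
// <2, 3, 0, 1>, moving each 64-bit element one 32-bit pair at a time.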
11419 return DAG.getBitcast(
11420 MVT::v2i64,
11421 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11422 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
11423 }
11424 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
11425 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
11426 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
11427 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
11429 // Try to use shift instructions.
11430 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
11431 Zeroable, Subtarget, DAG))
11432 return Shift;
11434 // When loading a scalar and then shuffling it into a vector we can often do
11435 // the insertion cheaply.
11436 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11437 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11438 return Insertion;
11439 // Try inverting the insertion since for v2 masks it is easy to do and we
11440 // can't reliably sort the mask one way or the other.
11441 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
11442 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11443 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11444 return Insertion;
11446 // We have different paths for blend lowering, but they all must use the
11447 // *exact* same predicate.
11448 bool IsBlendSupported = Subtarget.hasSSE41();
11449 if (IsBlendSupported)
11450 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
11451 Zeroable, Subtarget, DAG))
11452 return Blend;
11454 // Use dedicated unpack instructions for masks that match their pattern.
11455 if (SDValue V =
11456 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
11457 return V;
11459 // Try to use byte rotation instructions.
11460 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11461 if (Subtarget.hasSSSE3()) {
11462 if (Subtarget.hasVLX())
11463 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
11464 Mask, Subtarget, DAG))
11465 return Rotate;
11467 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11468 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11469 return Rotate;
11470 }
11472 // If we have direct support for blends, we should lower by decomposing into
11473 // a permute. That will be faster than the domain cross.
11474 if (IsBlendSupported)
11475 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
11476 Mask, DAG);
11478 // We implement this with SHUFPD which is pretty lame because it will likely
11479 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
11480 // However, all the alternatives are still more cycles and newer chips don't
11481 // have this problem. It would be really nice if x86 had better shuffles here.
11482 V1 = DAG.getBitcast(MVT::v2f64, V1);
11483 V2 = DAG.getBitcast(MVT::v2f64, V2);
11484 return DAG.getBitcast(MVT::v2i64,
11485 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
11486 }
11488 /// Test whether this can be lowered with a single SHUFPS instruction.
11490 /// This is used to disable more specialized lowerings when the shufps lowering
11491 /// will happen to be efficient.
11492 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
11493 // This routine only handles 128-bit shufps.
11494 assert(Mask.size() == 4 && "Unsupported mask size!");
11495 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
11496 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
11497 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
11498 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
11500 // To lower with a single SHUFPS we need to have the low half and high half
11501 // each requiring a single input.
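// For example, <0, 5, 2, 7> mixes both inputs within each half and cannot be
// a single SHUFPS, while <1, 0, 6, 7> takes its low half purely from V1 and
// its high half purely from V2, so it can.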
11502 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
11503 return false;
11504 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
11505 return false;
11507 return true;
11508 }
11510 /// Lower a vector shuffle using the SHUFPS instruction.
11512 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
11513 /// It makes no assumptions about whether this is the *best* lowering, it simply
11514 /// uses it.
11515 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
11516 ArrayRef<int> Mask, SDValue V1,
11517 SDValue V2, SelectionDAG &DAG) {
11518 SDValue LowV = V1, HighV = V2;
11519 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
11521 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11523 if (NumV2Elements == 1) {
11524 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
11526 // Compute the index adjacent to V2Index and in the same half by toggling
11527 // the low bit.
11528 int V2AdjIndex = V2Index ^ 1;
11530 if (Mask[V2AdjIndex] < 0) {
11531 // Handles all the cases where we have a single V2 element and an undef.
11532 // This will only ever happen in the high lanes because we commute the
11533 // vector otherwise.
11534 if (V2Index < 2)
11535 std::swap(LowV, HighV);
11536 NewMask[V2Index] -= 4;
11537 } else {
11538 // Handle the case where the V2 element ends up adjacent to a V1 element.
11539 // To make this work, blend them together as the first step.
11540 int V1Index = V2AdjIndex;
11541 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
11542 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11543 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11545 // Now proceed to reconstruct the final blend as we have the necessary
11546 // high or low half formed.
11547 if (V2Index < 2) {
11548 LowV = V2;
11549 HighV = V1;
11550 } else {
11551 HighV = V2;
11552 }
11553 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
11554 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
11555 }
11556 } else if (NumV2Elements == 2) {
11557 if (Mask[0] < 4 && Mask[1] < 4) {
11558 // Handle the easy case where we have V1 in the low lanes and V2 in the
11559 // high lanes.
11560 NewMask[2] -= 4;
11561 NewMask[3] -= 4;
11562 } else if (Mask[2] < 4 && Mask[3] < 4) {
11563 // We also handle the reversed case because this utility may get called
11564 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
11565 // arrange things in the right direction.
11566 NewMask[0] -= 4;
11567 NewMask[1] -= 4;
11568 HighV = V1;
11569 LowV = V2;
11570 } else {
11571 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
11572 // trying to place elements directly, just blend them and set up the final
11573 // shuffle to place them.
11575 // The first two blend mask elements are for V1, the second two are for
11576 // V2.
11577 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
11578 Mask[2] < 4 ? Mask[2] : Mask[3],
11579 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
11580 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
11581 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11582 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11584 // Now we do a normal shuffle of V1 by giving V1 as both operands to
11585 // the SHUFPS.
11586 LowV = HighV = V1;
11587 NewMask[0] = Mask[0] < 4 ? 0 : 2;
11588 NewMask[1] = Mask[0] < 4 ? 2 : 0;
11589 NewMask[2] = Mask[2] < 4 ? 1 : 3;
11590 NewMask[3] = Mask[2] < 4 ? 3 : 1;
11591 }
11592 }
11593 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
11594 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
11595 }
11597 /// Lower 4-lane 32-bit floating point shuffles.
11599 /// Uses instructions exclusively from the floating point unit to minimize
11600 /// domain crossing penalties, as these are sufficient to implement all v4f32
11601 /// shuffles.
11602 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11603 const APInt &Zeroable,
11604 SDValue V1, SDValue V2,
11605 const X86Subtarget &Subtarget,
11606 SelectionDAG &DAG) {
11607 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11608 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11609 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11611 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11613 if (NumV2Elements == 0) {
11614 // Check for being able to broadcast a single element.
11615 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11616 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
11617 return Broadcast;
11619 // Use even/odd duplicate instructions for masks that match their pattern.
11620 if (Subtarget.hasSSE3()) {
11621 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11622 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
11623 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11624 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
11625 }
11627 if (Subtarget.hasAVX()) {
11628 // If we have AVX, we can use VPERMILPS which will allow folding a load
11629 // into the shuffle.
11630 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11631 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11632 }
11634 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11635 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11636 if (!Subtarget.hasSSE2()) {
11637 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11638 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11639 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11640 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11641 }
11643 // Otherwise, use a straight shuffle of a single input vector. We pass the
11644 // input vector to both operands to simulate this with a SHUFPS.
11645 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11646 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11647 }
11649 // There are special ways we can lower some single-element blends. However, we
11650 // have custom ways we can lower more complex single-element blends below that
11651 // we defer to if both this and BLENDPS fail to match, so restrict this to
11652 // when the V2 input is targeting element 0 of the mask -- that is the fast
11653 // case here.
11654 if (NumV2Elements == 1 && Mask[0] >= 4)
11655 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11656 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11657 return V;
11659 if (Subtarget.hasSSE41()) {
11660 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11661 Zeroable, Subtarget, DAG))
11662 return Blend;
11664 // Use INSERTPS if we can complete the shuffle efficiently.
11665 if (SDValue V =
11666 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
11667 return V;
11669 if (!isSingleSHUFPSMask(Mask))
11670 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11671 DL, MVT::v4f32, V1, V2, Mask, DAG))
11672 return BlendPerm;
11673 }
11675 // Use low/high mov instructions. These are only valid in SSE1 because
11676 // otherwise they are widened to v2f64 and never get here.
11677 if (!Subtarget.hasSSE2()) {
11678 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11679 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11680 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11681 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11682 }
11684 // Use dedicated unpack instructions for masks that match their pattern.
11685 if (SDValue V =
11686 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11687 return V;
11689 // Otherwise fall back to a SHUFPS lowering strategy.
11690 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11691 }
11693 /// Lower 4-lane i32 vector shuffles.
11695 /// We try to handle these with integer-domain shuffles where we can, but for
11696 /// blends we use the floating point domain blend instructions.
11697 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11698 const APInt &Zeroable,
11699 SDValue V1, SDValue V2,
11700 const X86Subtarget &Subtarget,
11701 SelectionDAG &DAG) {
11702 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11703 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11704 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11706 // Whenever we can lower this as a zext, that instruction is strictly faster
11707 // than any alternative. It also allows us to fold memory operands into the
11708 // shuffle in many cases.
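// For example, the v4i32 mask <0, Z, 1, Z>, where Z denotes a zeroable lane,
// is exactly a PMOVZXDQ of the low two elements.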
11709 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11710 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11711 return ZExt;
11713 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11715 if (NumV2Elements == 0) {
11716 // Check for being able to broadcast a single element.
11717 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11718 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11719 return Broadcast;
11721 // Straight shuffle of a single input vector. For everything from SSE2
11722 // onward this has a single fast instruction with no scary immediates.
11723 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11724 // but we aren't actually going to use the UNPCK instruction because doing
11725 // so prevents folding a load into this instruction or making a copy.
11726 const int UnpackLoMask[] = {0, 0, 1, 1};
11727 const int UnpackHiMask[] = {2, 2, 3, 3};
11728 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11729 Mask = UnpackLoMask;
11730 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11731 Mask = UnpackHiMask;
11733 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11734 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11735 }
11737 // Try to use shift instructions.
11738 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11739 Zeroable, Subtarget, DAG))
11740 return Shift;
11742 // There are special ways we can lower some single-element blends.
11743 if (NumV2Elements == 1)
11744 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11745 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11746 return V;
11748 // We have different paths for blend lowering, but they all must use the
11749 // *exact* same predicate.
11750 bool IsBlendSupported = Subtarget.hasSSE41();
11751 if (IsBlendSupported)
11752 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11753 Zeroable, Subtarget, DAG))
11754 return Blend;
11756 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11757 Zeroable, DAG))
11758 return Masked;
11760 // Use dedicated unpack instructions for masks that match their pattern.
11761 if (SDValue V =
11762 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11763 return V;
11765 // Try to use byte rotation instructions.
11766 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11767 if (Subtarget.hasSSSE3()) {
11768 if (Subtarget.hasVLX())
11769 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11770 Mask, Subtarget, DAG))
11771 return Rotate;
11773 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11774 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11775 return Rotate;
11776 }
11778 // Assume that a single SHUFPS is faster than an alternative sequence of
11779 // multiple instructions (even if the CPU has a domain penalty).
11780 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11781 if (!isSingleSHUFPSMask(Mask)) {
11782 // If we have direct support for blends, we should lower by decomposing into
11783 // a permute. That will be faster than the domain cross.
11784 if (IsBlendSupported)
11785 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11786 Mask, DAG);
11788 // Try to lower by permuting the inputs into an unpack instruction.
11789 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11790 DL, MVT::v4i32, V1, V2, Mask, DAG))
11791 return Unpack;
11792 }
11794 // We implement this with SHUFPS because it can blend from two vectors.
11795 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11796 // up the inputs, bypassing domain shift penalties that we would incur if we
11797 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
11798 // relevant.
11799 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11800 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11801 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11802 return DAG.getBitcast(MVT::v4i32, ShufPS);
11803 }
11805 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11806 /// shuffle lowering, and the most complex part.
11808 /// The lowering strategy is to try to form pairs of input lanes which are
11809 /// targeted at the same half of the final vector, and then use a dword shuffle
11810 /// to place them onto the right half, and finally unpack the paired lanes into
11811 /// their final position.
11813 /// The exact breakdown of how to form these dword pairs and align them on the
11814 /// correct sides is really tricky. See the comments within the function for
11815 /// more of the details.
11817 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11818 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11819 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11820 /// vector, form the analogous 128-bit 8-element Mask.
11821 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11822 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11823 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11824 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11825 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11827 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11828 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11829 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11831 // Attempt to directly match PSHUFLW or PSHUFHW.
11832 if (isUndefOrInRange(LoMask, 0, 4) &&
11833 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
11834 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11835 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11836 }
11837 if (isUndefOrInRange(HiMask, 4, 8) &&
11838 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
11839 for (int i = 0; i != 4; ++i)
11840 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
11841 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11842 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11843 }
11845 SmallVector<int, 4> LoInputs;
11846 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11847 array_pod_sort(LoInputs.begin(), LoInputs.end());
11848 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11849 SmallVector<int, 4> HiInputs;
11850 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11851 array_pod_sort(HiInputs.begin(), HiInputs.end());
11852 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11853 int NumLToL =
11854 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11855 int NumHToL = LoInputs.size() - NumLToL;
11856 int NumLToH =
11857 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11858 int NumHToH = HiInputs.size() - NumLToH;
11859 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11860 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11861 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11862 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11864 // If we are shuffling values from one half - check how many different DWORD
11865 // pairs we need to create. If only 1 or 2 then we can perform this as a
11866 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
11867 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
11868 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
11869 V = DAG.getNode(ShufWOp, DL, VT, V,
11870 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11871 V = DAG.getBitcast(PSHUFDVT, V);
11872 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11873 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11874 return DAG.getBitcast(VT, V);
11875 };
11877 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
11878 int PSHUFDMask[4] = { -1, -1, -1, -1 };
11879 SmallVector<std::pair<int, int>, 4> DWordPairs;
11880 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
11882 // Collect the different DWORD pairs.
11883 for (int DWord = 0; DWord != 4; ++DWord) {
11884 int M0 = Mask[2 * DWord + 0];
11885 int M1 = Mask[2 * DWord + 1];
11886 M0 = (M0 >= 0 ? M0 % 4 : M0);
11887 M1 = (M1 >= 0 ? M1 % 4 : M1);
11888 if (M0 < 0 && M1 < 0)
11889 continue;
11891 bool Match = false;
11892 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
11893 auto &DWordPair = DWordPairs[j];
11894 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
11895 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
11896 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
11897 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
11898 PSHUFDMask[DWord] = DOffset + j;
11899 Match = true;
11900 break;
11901 }
11902 }
11903 if (!Match) {
11904 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
11905 DWordPairs.push_back(std::make_pair(M0, M1));
11906 }
11907 }
11909 if (DWordPairs.size() <= 2) {
11910 DWordPairs.resize(2, std::make_pair(-1, -1));
11911 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
11912 DWordPairs[1].first, DWordPairs[1].second};
11913 if ((NumHToL + NumHToH) == 0)
11914 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
11915 if ((NumLToL + NumLToH) == 0)
11916 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
11917 }
11918 }
11920 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11921 // such inputs we can swap two of the dwords across the half mark and end up
11922 // with <=2 inputs to each half in each half. Once there, we can fall through
11923 // to the generic code below. For example:
11925 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11926 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11928 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11929 // and an existing 2-into-2 on the other half. In this case we may have to
11930 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11931 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11932 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11933 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11934 // half than the one we target for fixing) will be fixed when we re-enter this
11935 // path. We will also combine away any sequence of PSHUFD instructions that
11936 // result into a single instruction. Here is an example of the tricky case:
11938 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11939 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11941 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11943 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11944 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11946 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11947 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11949 // The result is fine to be handled by the generic logic.
11950 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11951 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11952 int AOffset, int BOffset) {
11953 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11954 "Must call this with A having 3 or 1 inputs from the A half.");
11955 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11956 "Must call this with B having 1 or 3 inputs from the B half.");
11957 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11958 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11960 bool ThreeAInputs = AToAInputs.size() == 3;
11962 // Compute the index of dword with only one word among the three inputs in
11963 // a half by taking the sum of the half with three inputs and subtracting
11964 // the sum of the actual three inputs. The difference is the remaining
11965 // slot.
11966 int ADWord, BDWord;
11967 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11968 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11969 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11970 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11971 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11972 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11973 int TripleNonInputIdx =
11974 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11975 TripleDWord = TripleNonInputIdx / 2;
11977 // We use xor with one to compute the adjacent DWord to whichever one the
11978 // OneInput is in.
11979 OneInputDWord = (OneInput / 2) ^ 1;
11981 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11982 // and BToA inputs. If there is also such a problem with the BToB and AToB
11983 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11984 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11985 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11986 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11987 // Compute how many inputs will be flipped by swapping these DWords. We
11988 // need to balance this to ensure we don't form a 3-1 shuffle in the
11989 // other half.
11991 int NumFlippedAToBInputs =
11992 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11993 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11994 int NumFlippedBToBInputs =
11995 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11996 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11997 if ((NumFlippedAToBInputs == 1 &&
11998 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11999 (NumFlippedBToBInputs == 1 &&
12000 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
12001 // We choose whether to fix the A half or B half based on whether that
12002 // half has zero flipped inputs. At zero, we may not be able to fix it
12003 // with that half. We also bias towards fixing the B half because that
12004 // will more commonly be the high half, and we have to bias one way.
12005 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
12006 ArrayRef<int> Inputs) {
12007 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
12008 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
12009 // Determine whether the free index is in the flipped dword or the
12010 // unflipped dword based on where the pinned index is. We use this bit
12011 // in an xor to conditionally select the adjacent dword.
12012 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
12013 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
12014 if (IsFixIdxInput == IsFixFreeIdxInput)
12015 FixFreeIdx += 1;
12016 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
12017 assert(IsFixIdxInput != IsFixFreeIdxInput &&
12018 "We need to be changing the number of flipped inputs!");
12019 int PSHUFHalfMask[] = {0, 1, 2, 3};
12020 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
12021 V = DAG.getNode(
12022 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
12023 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
12024 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
12026 for (int &M : Mask)
12027 if (M >= 0 && M == FixIdx)
12028 M = FixFreeIdx;
12029 else if (M >= 0 && M == FixFreeIdx)
12030 M = FixIdx;
12031 };
12032 if (NumFlippedBToBInputs != 0) {
12033 int BPinnedIdx =
12034 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
12035 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
12036 } else {
12037 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
12038 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
12039 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
12040 }
12041 }
12042 }
12044 int PSHUFDMask[] = {0, 1, 2, 3};
12045 PSHUFDMask[ADWord] = BDWord;
12046 PSHUFDMask[BDWord] = ADWord;
12047 V = DAG.getBitcast(
12048 VT,
12049 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12050 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12052 // Adjust the mask to match the new locations of A and B.
12053 for (int &M : Mask)
12054 if (M >= 0 && M/2 == ADWord)
12055 M = 2 * BDWord + M % 2;
12056 else if (M >= 0 && M/2 == BDWord)
12057 M = 2 * ADWord + M % 2;
12059 // Recurse back into this routine to re-compute state now that this isn't
12060 // a 3 and 1 problem.
12061 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
12062 DAG);
12063 };
12064 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
12065 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
12066 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
12067 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
12069 // At this point there are at most two inputs to the low and high halves from
12070 // each half. That means the inputs can always be grouped into dwords and
12071 // those dwords can then be moved to the correct half with a dword shuffle.
12072 // We use at most one low and one high word shuffle to collect these paired
12073 // inputs into dwords, and finally a dword shuffle to place them.
12074 int PSHUFLMask[4] = {-1, -1, -1, -1};
12075 int PSHUFHMask[4] = {-1, -1, -1, -1};
12076 int PSHUFDMask[4] = {-1, -1, -1, -1};
12078 // First fix the masks for all the inputs that are staying in their
12079 // original halves. This will then dictate the targets of the cross-half
12080 // shuffles.
12081 auto fixInPlaceInputs =
12082 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
12083 MutableArrayRef<int> SourceHalfMask,
12084 MutableArrayRef<int> HalfMask, int HalfOffset) {
12085 if (InPlaceInputs.empty())
12086 return;
12087 if (InPlaceInputs.size() == 1) {
12088 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12089 InPlaceInputs[0] - HalfOffset;
12090 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
12091 return;
12092 }
12093 if (IncomingInputs.empty()) {
12094 // Just fix all of the in place inputs.
12095 for (int Input : InPlaceInputs) {
12096 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
12097 PSHUFDMask[Input / 2] = Input / 2;
12098 }
12099 return;
12100 }
12102 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
12103 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12104 InPlaceInputs[0] - HalfOffset;
12105 // Put the second input next to the first so that they are packed into
12106 // a dword. We find the adjacent index by toggling the low bit.
12107 int AdjIndex = InPlaceInputs[0] ^ 1;
12108 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
12109 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
12110 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
12111 };
12112 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
12113 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
12115 // Now gather the cross-half inputs and place them into a free dword of
12116 // their target half.
12117 // FIXME: This operation could almost certainly be simplified dramatically to
12118 // look more like the 3-1 fixing operation.
12119 auto moveInputsToRightHalf = [&PSHUFDMask](
12120 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
12121 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
12122 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
12123 int DestOffset) {
12124 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
12125 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
12126 };
12127 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
12128 int Word) {
12129 int LowWord = Word & ~1;
12130 int HighWord = Word | 1;
12131 return isWordClobbered(SourceHalfMask, LowWord) ||
12132 isWordClobbered(SourceHalfMask, HighWord);
12133 };
12135 if (IncomingInputs.empty())
12136 return;
12138 if (ExistingInputs.empty()) {
12139 // Map any dwords with inputs from them into the right half.
12140 for (int Input : IncomingInputs) {
12141 // If the source half mask maps over the inputs, turn those into
12142 // swaps and use the swapped lane.
12143 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
12144 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
12145 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
12146 Input - SourceOffset;
12147 // We have to swap the uses in our half mask in one sweep.
12148 for (int &M : HalfMask)
12149 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
12150 M = Input;
12151 else if (M == Input)
12152 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12153 } else {
12154 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
12155 Input - SourceOffset &&
12156 "Previous placement doesn't match!");
12157 }
12158 // Note that this correctly re-maps both when we do a swap and when
12159 // we observe the other side of the swap above. We rely on that to
12160 // avoid swapping the members of the input list directly.
12161 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12162 }
12164 // Map the input's dword into the correct half.
12165 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
12166 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
12167 else
12168 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
12169 Input / 2 &&
12170 "Previous placement doesn't match!");
12171 }
12173 // And just directly shift any other-half mask elements to be same-half
12174 // as we will have mirrored the dword containing the element into the
12175 // same position within that half.
12176 for (int &M : HalfMask)
12177 if (M >= SourceOffset && M < SourceOffset + 4) {
12178 M = M - SourceOffset + DestOffset;
12179 assert(M >= 0 && "This should never wrap below zero!");
12180 }
12181 return;
12182 }
12184 // Ensure we have the input in a viable dword of its current half. This
12185 // is particularly tricky because the original position may be clobbered
12186 // by inputs being moved and *staying* in that half.
12187 if (IncomingInputs.size() == 1) {
12188 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12189 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
12190 SourceOffset;
12191 SourceHalfMask[InputFixed - SourceOffset] =
12192 IncomingInputs[0] - SourceOffset;
12193 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
12194 InputFixed);
12195 IncomingInputs[0] = InputFixed;
12196 }
12197 } else if (IncomingInputs.size() == 2) {
12198 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
12199 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12200 // We have two non-adjacent or clobbered inputs we need to extract from
12201 // the source half. To do this, we need to map them into some adjacent
12202 // dword slot in the source mask.
12203 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
12204 IncomingInputs[1] - SourceOffset};
12206 // If there is a free slot in the source half mask adjacent to one of
12207 // the inputs, place the other input in it. We use (Index XOR 1) to
12208 // compute an adjacent index.
12209 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
12210 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
12211 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
12212 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12213 InputsFixed[1] = InputsFixed[0] ^ 1;
12214 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
12215 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
12216 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
12217 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
12218 InputsFixed[0] = InputsFixed[1] ^ 1;
12219 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
12220 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
12221 // The two inputs are in the same DWord but it is clobbered and the
12222 // adjacent DWord isn't used at all. Move both inputs to the free
12223 // slot.
12224 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
12225 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
12226 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
12227 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
12228 } else {
12229 // The only way we hit this point is if there is no clobbering
12230 // (because there are no off-half inputs to this half) and there is no
12231 // free slot adjacent to one of the inputs. In this case, we have to
12232 // swap an input with a non-input.
12233 for (int i = 0; i < 4; ++i)
12234 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
12235 "We can't handle any clobbers here!");
12236 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
12237 "Cannot have adjacent inputs here!");
12239 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12240 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
12242 // We also have to update the final source mask in this case because
12243 // it may need to undo the above swap.
12244 for (int &M : FinalSourceHalfMask)
12245 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
12246 M = InputsFixed[1] + SourceOffset;
12247 else if (M == InputsFixed[1] + SourceOffset)
12248 M = (InputsFixed[0] ^ 1) + SourceOffset;
12250 InputsFixed[1] = InputsFixed[0] ^ 1;
12251 }
12253 // Point everything at the fixed inputs.
12254 for (int &M : HalfMask)
12255 if (M == IncomingInputs[0])
12256 M = InputsFixed[0] + SourceOffset;
12257 else if (M == IncomingInputs[1])
12258 M = InputsFixed[1] + SourceOffset;
12260 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
12261 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
12262 }
12263 } else {
12264 llvm_unreachable("Unhandled input size!");
12265 }
12267 // Now hoist the DWord down to the right half.
12268 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
12269 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
12270 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
12271 for (int &M : HalfMask)
12272 for (int Input : IncomingInputs)
12273 if (M == Input)
12274 M = FreeDWord * 2 + Input % 2;
12275 };
12276 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
12277 /*SourceOffset*/ 4, /*DestOffset*/ 0);
12278 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
12279 /*SourceOffset*/ 0, /*DestOffset*/ 4);
12281 // Now enact all the shuffles we've computed to move the inputs into their
12282 // target halves.
12283 if (!isNoopShuffleMask(PSHUFLMask))
12284 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12285 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
12286 if (!isNoopShuffleMask(PSHUFHMask))
12287 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12288 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
12289 if (!isNoopShuffleMask(PSHUFDMask))
12290 V = DAG.getBitcast(
12291 VT,
12292 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12293 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12295 // At this point, each half should contain all its inputs, and we can then
12296 // just shuffle them into their final position.
12297 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
12298 "Failed to lift all the high half inputs to the low mask!");
12299 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
12300 "Failed to lift all the low half inputs to the high mask!");
12302 // Do a half shuffle for the low mask.
12303 if (!isNoopShuffleMask(LoMask))
12304 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12305 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
12307 // Do a half shuffle with the high mask after shifting its values down.
12308 for (int &M : HiMask)
12309 if (M >= 0)
12310 M -= 4;
12311 if (!isNoopShuffleMask(HiMask))
12312 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12313 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
12315 return V;
12316 }
12318 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
12319 /// blend if only one input is used.
12320 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
12321 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12322 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
12323 bool &V2InUse) {
12324 SDValue V1Mask[16];
12325 SDValue V2Mask[16];
12326 V1InUse = false;
12327 V2InUse = false;
12329 int Size = Mask.size();
12330 int Scale = 16 / Size;
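// Each shuffle mask element covers Scale bytes of the PSHUFB control
// vectors; for a v8i16 shuffle Scale is 2, so word element j expands into
// byte selectors 2*j and 2*j+1.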
12331 for (int i = 0; i < 16; ++i) {
12332 if (Mask[i / Scale] < 0) {
12333 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
12334 } else {
12335 const int ZeroMask = 0x80;
12336 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
12337 : ZeroMask;
12338 int V2Idx = Mask[i / Scale] < Size
12339 ? ZeroMask
12340 : (Mask[i / Scale] - Size) * Scale + i % Scale;
12341 if (Zeroable[i / Scale])
12342 V1Idx = V2Idx = ZeroMask;
12343 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
12344 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
12345 V1InUse |= (ZeroMask != V1Idx);
12346 V2InUse |= (ZeroMask != V2Idx);
12347 }
12348 }
12350 if (V1InUse)
12351 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12352 DAG.getBitcast(MVT::v16i8, V1),
12353 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
12354 if (V2InUse)
12355 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12356 DAG.getBitcast(MVT::v16i8, V2),
12357 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
12359 // If we need shuffled inputs from both, blend the two.
12360 SDValue V;
12361 if (V1InUse && V2InUse)
12362 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
12363 else
12364 V = V1InUse ? V1 : V2;
12366 // Cast the result back to the correct type.
12367 return DAG.getBitcast(VT, V);
12368 }
12370 /// Generic lowering of 8-lane i16 shuffles.
12372 /// This handles both single-input shuffles and combined shuffle/blends with
12373 /// two inputs. The single input shuffles are immediately delegated to
12374 /// a dedicated lowering routine.
12376 /// The blends are lowered in one of three fundamental ways. If there are few
12377 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
12378 /// of the input is significantly cheaper when lowered as an interleaving of
12379 /// the two inputs, try to interleave them. Otherwise, blend the low and high
12380 /// halves of the inputs separately (making them have relatively few inputs)
12381 /// and then concatenate them.
12382 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12383 const APInt &Zeroable,
12384 SDValue V1, SDValue V2,
12385 const X86Subtarget &Subtarget,
12386 SelectionDAG &DAG) {
12387 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12388 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12389 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12391 // Whenever we can lower this as a zext, that instruction is strictly faster
12392 // than any alternative.
12393 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12394 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12395 return ZExt;
12397 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
12399 if (NumV2Inputs == 0) {
12400 // Check for being able to broadcast a single element.
12401 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12402 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12403 return Broadcast;
12405 // Try to use shift instructions.
12406 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
12407 Zeroable, Subtarget, DAG))
12408 return Shift;
12410 // Use dedicated unpack instructions for masks that match their pattern.
12411 if (SDValue V =
12412 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12413 return V;
12415 // Use dedicated pack instructions for masks that match their pattern.
12416 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
12417 DAG, Subtarget))
12418 return V;
12420 // Try to use byte rotation instructions.
12421 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
12422 Mask, Subtarget, DAG))
12423 return Rotate;
12425 // Make a copy of the mask so it can be modified.
12426 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
12427 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
12428 MutableMask, Subtarget,
12429 DAG);
12430 }
12432 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
12433 "All single-input shuffles should be canonicalized to be V1-input "
12436 // Try to use shift instructions.
12437 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
12438 Zeroable, Subtarget, DAG))
12439 return Shift;
12441 // See if we can use SSE4A Extraction / Insertion.
12442 if (Subtarget.hasSSE4A())
12443 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
12444 Zeroable, DAG))
12445 return V;
12447 // There are special ways we can lower some single-element blends.
12448 if (NumV2Inputs == 1)
12449 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12450 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12451 return V;
12453 // We have different paths for blend lowering, but they all must use the
12454 // *exact* same predicate.
12455 bool IsBlendSupported = Subtarget.hasSSE41();
12456 if (IsBlendSupported)
12457 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
12458 Zeroable, Subtarget, DAG))
12459 return Blend;
12461 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
12462 Zeroable, DAG))
12463 return Masked;
12465 // Use dedicated unpack instructions for masks that match their pattern.
12466 if (SDValue V =
12467 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12468 return V;
12470 // Use dedicated pack instructions for masks that match their pattern.
12471 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
12472 Subtarget))
12473 return V;
12475 // Try to use byte rotation instructions.
12476 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12477 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12478 return Rotate;
12480 if (SDValue BitBlend =
12481 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
12482 return BitBlend;
12484 // Try to lower by permuting the inputs into an unpack instruction.
12485 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
12486 V2, Mask, DAG))
12487 return Unpack;
12489 // If we can't directly blend but can use PSHUFB, that will be better as it
12490 // can both shuffle and set up the inefficient blend.
12491 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
12492 bool V1InUse, V2InUse;
12493 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
12494 Zeroable, DAG, V1InUse, V2InUse);
12495 }
12497 // We can always bit-blend if we have to so the fallback strategy is to
12498 // decompose into single-input permutes and blends.
12499 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
12500 Mask, DAG);
12501 }
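// Illustrative examples of the cascade above (not from the original source):
// the mask <0, 8, 1, 9, 2, 10, 3, 11> is caught early by
// lowerVectorShuffleWithUNPCK (it is exactly punpcklwd), while
// <0, 9, 2, 11, 4, 13, 6, 15> keeps every element at its original position
// and so, with SSE4.1 available, becomes a single PBLENDW with immediate
// 0xAA (set bits select V2 at the odd positions).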
12503 /// Check whether a compaction lowering can be done by dropping even
12504 /// elements and compute how many times even elements must be dropped.
12506 /// This handles shuffles which take every Nth element where N is a power of
12507 /// two. Example shuffle masks:
12509 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12510 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12511 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12512 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12513 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12514 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12516 /// Any of these lanes can of course be undef.
12518 /// This routine only supports N <= 3.
12519 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12520 /// for larger N.
12522 /// \returns N above, or the number of times even elements must be dropped if
12523 /// there is such a number. Otherwise returns zero.
12524 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12525 bool IsSingleInput) {
12526 // The modulus for the shuffle vector entries is based on whether this is
12527 // a single input or not.
12528 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12529 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12530 "We should only be called with masks with a power-of-2 size!");
12532 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12534 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12535 // and 2^3 simultaneously. This is because we may have ambiguity with
12536 // partially undef inputs.
12537 bool ViableForN[3] = {true, true, true};
12539 for (int i = 0, e = Mask.size(); i < e; ++i) {
12540 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12541 // need.
12542 if (Mask[i] < 0)
12543 continue;
12545 bool IsAnyViable = false;
12546 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12547 if (ViableForN[j]) {
12548 uint64_t N = j + 1;
12550 // The shuffle mask must be equal to (i * 2^N) % M.
12551 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12552 IsAnyViable = true;
12553 else
12554 ViableForN[j] = false;
12556 // Early exit if we exhaust the possible powers of two.
12557 if (!IsAnyViable)
12558 break;
12559 }
12561 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12562 if (ViableForN[j])
12563 return j + 1;
12565 // Return 0 as there is no viable power of two.
12566 return 0;
12567 }
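// Worked trace (illustrative): for the single-input v16i8 mask
// <0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14>, ShuffleModulus is
// 16 and every entry equals (i << 1) & 15, so ViableForN[0] survives the loop
// and the routine returns N = 1: one round of even-element dropping suffices.
// N = 2 fails immediately since Mask[1] == 2 but (1 << 2) & 15 == 4.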
12569 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12570 ArrayRef<int> Mask, SDValue V1,
12571 SDValue V2, SelectionDAG &DAG) {
12572 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12573 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12575 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12576 if (V2.isUndef())
12577 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12579 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12580 }
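// In the VPERMV3 form the mask node sits between the two vector operands
// (VPERMT2-style): mask entries in [0, NumElts) select from V1 and entries in
// [NumElts, 2*NumElts) select from V2. For example (illustrative), a v16i8
// mask entry of 20 picks byte 4 of V2.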
12582 /// Generic lowering of v16i8 shuffles.
12584 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12585 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12586 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
12587 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
12588 /// back together.
12589 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12590 const APInt &Zeroable,
12591 SDValue V1, SDValue V2,
12592 const X86Subtarget &Subtarget,
12593 SelectionDAG &DAG) {
12594 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12595 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12596 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12598 // Try to use shift instructions.
12599 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
12600 Zeroable, Subtarget, DAG))
12601 return Shift;
12603 // Try to use byte rotation instructions.
12604 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12605 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12606 return Rotate;
12608 // Use dedicated pack instructions for masks that match their pattern.
12609 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
12610 Subtarget))
12611 return V;
12613 // Try to use a zext lowering.
12614 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12615 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12616 return ZExt;
12618 // See if we can use SSE4A Extraction / Insertion.
12619 if (Subtarget.hasSSE4A())
12620 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
12621 Zeroable, DAG))
12622 return V;
12624 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12626 // For single-input shuffles, there are some nicer lowering tricks we can use.
12627 if (NumV2Elements == 0) {
12628 // Check for being able to broadcast a single element.
12629 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12630 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12631 return Broadcast;
12633 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12634 // Notably, this handles splat and partial-splat shuffles more efficiently.
12635 // However, it only makes sense if the pre-duplication shuffle simplifies
12636 // things significantly. Currently, this means we need to be able to
12637 // express the pre-duplication shuffle as an i16 shuffle.
12639 // FIXME: We should check for other patterns which can be widened into an
12640 // i16 shuffle as well.
12641 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
12642 for (int i = 0; i < 16; i += 2)
12643 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
12644 return false;
12646 return true;
12647 };
12648 auto tryToWidenViaDuplication = [&]() -> SDValue {
12649 if (!canWidenViaDuplication(Mask))
12650 return SDValue();
12651 SmallVector<int, 4> LoInputs;
12652 copy_if(Mask, std::back_inserter(LoInputs),
12653 [](int M) { return M >= 0 && M < 8; });
12654 array_pod_sort(LoInputs.begin(), LoInputs.end());
12655 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
12656 LoInputs.end());
12657 SmallVector<int, 4> HiInputs;
12658 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
12659 array_pod_sort(HiInputs.begin(), HiInputs.end());
12660 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
12661 HiInputs.end());
12663 bool TargetLo = LoInputs.size() >= HiInputs.size();
12664 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
12665 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
12667 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
12668 SmallDenseMap<int, int, 8> LaneMap;
12669 for (int I : InPlaceInputs) {
12670 PreDupI16Shuffle[I/2] = I/2;
12671 LaneMap[I] = I;
12672 }
12673 int j = TargetLo ? 0 : 4, je = j + 4;
12674 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
12675 // Check if j is already a shuffle of this input. This happens when
12676 // there are two adjacent bytes after we move the low one.
12677 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
12678 // If we haven't yet mapped the input, search for a slot into which
12679 // we can map it.
12680 while (j < je && PreDupI16Shuffle[j] >= 0)
12681 ++j;
12683 if (j == je)
12684 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
12685 return SDValue();
12687 // Map this input with the i16 shuffle.
12688 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12689 }
12691 // Update the lane map based on the mapping we ended up with.
12692 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12693 }
12694 V1 = DAG.getBitcast(
12695 MVT::v16i8,
12696 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12697 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12699 // Unpack the bytes to form the i16s that will be shuffled into place.
12700 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12701 MVT::v16i8, V1, V1);
12703 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12704 for (int i = 0; i < 16; ++i)
12705 if (Mask[i] >= 0) {
12706 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12707 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12708 if (PostDupI16Shuffle[i / 2] < 0)
12709 PostDupI16Shuffle[i / 2] = MappedMask;
12710 else
12711 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12712 "Conflicting entries in the original shuffle!");
12714 return DAG.getBitcast(
12715 MVT::v16i8,
12716 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12717 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12718 };
12719 if (SDValue V = tryToWidenViaDuplication())
12720 return V;
12721 }
12723 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12724 Zeroable, DAG))
12725 return Masked;
12727 // Use dedicated unpack instructions for masks that match their pattern.
12728 if (SDValue V =
12729 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12730 return V;
12732 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12733 // with PSHUFB. It is important to do this before we attempt to generate any
12734 // blends but after all of the single-input lowerings. If the single input
12735 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12736 // want to preserve that and we can DAG combine any longer sequences into
12737 // a PSHUFB in the end. But once we start blending from multiple inputs,
12738 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12739 // and there are *very* few patterns that would actually be faster than the
12740 // PSHUFB approach because of its ability to zero lanes.
12742 // FIXME: The only exceptions to the above are blends which are exact
12743 // interleavings with direct instructions supporting them. We currently don't
12744 // handle those well here.
12745 if (Subtarget.hasSSSE3()) {
12746 bool V1InUse = false;
12747 bool V2InUse = false;
12749 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12750 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12752 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12753 // do so. This avoids using them to handle blends-with-zero which is
12754 // important as a single pshufb is significantly faster for that.
12755 if (V1InUse && V2InUse) {
12756 if (Subtarget.hasSSE41())
12757 if (SDValue Blend = lowerVectorShuffleAsBlend(
12758 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12759 return Blend;
12761 // We can use an unpack to do the blending rather than an or in some
12762 // cases. Even though the or may be (very slightly) more efficient, we
12763 // prefer this lowering because there are common cases where part of
12764 // the complexity of the shuffles goes away when we do the final blend as
12765 // an unpack.
12766 // FIXME: It might be worth trying to detect if the unpack-feeding
12767 // shuffles will both be pshufb, in which case we shouldn't bother with
12768 // this.
12769 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12770 DL, MVT::v16i8, V1, V2, Mask, DAG))
12771 return Unpack;
12773 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
12774 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
12775 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
12776 }
12778 return PSHUFB;
12779 }
12781 // There are special ways we can lower some single-element blends.
12782 if (NumV2Elements == 1)
12783 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12784 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12785 return V;
12787 if (SDValue BitBlend =
12788 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12789 return BitBlend;
12791 // Check whether a compaction lowering can be done. This handles shuffles
12792 // which take every Nth element for some even N. See the helper function for
12793 // details.
12795 // We special case these as they can be particularly efficiently handled with
12796 // the PACKUSWB instruction on x86 and they show up in common patterns of
12797 // rearranging bytes to truncate wide elements.
12798 bool IsSingleInput = V2.isUndef();
12799 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12800 // NumEvenDrops is the power of two stride of the elements. Another way of
12801 // thinking about it is that we need to drop the even elements this many
12802 // times to get the original input.
12804 // First we need to zero all the dropped bytes.
12805 assert(NumEvenDrops <= 3 &&
12806 "No support for dropping even elements more than 3 times.");
12807 // We use the mask type to pick which bytes are preserved based on how many
12808 // elements are dropped.
12809 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12810 SDValue ByteClearMask = DAG.getBitcast(
12811 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12812 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12813 if (!IsSingleInput)
12814 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12816 // Now pack things back together.
12817 V1 = DAG.getBitcast(MVT::v8i16, V1);
12818 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12819 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12820 for (int i = 1; i < NumEvenDrops; ++i) {
12821 Result = DAG.getBitcast(MVT::v8i16, Result);
12822 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12823 }
12825 return Result;
12826 }
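// Worked example (illustrative): for the two-input mask <0, 2, 4, ..., 30>,
// canLowerByDroppingEvenElements returns 1; the v8i16 constant 0x00FF
// bitcasts to the byte pattern <FF, 00, FF, 00, ...>, which clears every odd
// byte, and a single PACKUSWB of the two masked inputs then produces exactly
// the even bytes of V1 followed by the even bytes of V2.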
12828 // Handle multi-input cases by blending single-input shuffles.
12829 if (NumV2Elements > 0)
12830 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12831 Mask, DAG);
12833 // The fallback path for single-input shuffles widens this into two v8i16
12834 // vectors with unpacks, shuffles those, and then pulls them back together
12835 // with a pack.
12836 SDValue V = V1;
12838 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12839 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12840 for (int i = 0; i < 16; ++i)
12841 if (Mask[i] >= 0)
12842 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12844 SDValue VLoHalf, VHiHalf;
12845 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12846 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12847 // i16s.
12848 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12849 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12850 // Use a mask to drop the high bytes.
12851 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12852 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12853 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12855 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12856 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12858 // Squash the masks to point directly into VLoHalf.
12859 for (int &M : LoBlendMask)
12860 if (M >= 0)
12861 M /= 2;
12862 for (int &M : HiBlendMask)
12863 if (M >= 0)
12864 M /= 2;
12865 } else {
12866 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12867 // VHiHalf so that we can blend them as i16s.
12868 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12870 VLoHalf = DAG.getBitcast(
12871 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12872 VHiHalf = DAG.getBitcast(
12873 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12874 }
12876 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12877 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12879 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12880 }
12882 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
12884 /// This routine breaks down the specific type of 128-bit shuffle and
12885 /// dispatches to the lowering routines accordingly.
12886 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12887 MVT VT, SDValue V1, SDValue V2,
12888 const APInt &Zeroable,
12889 const X86Subtarget &Subtarget,
12890 SelectionDAG &DAG) {
12891 switch (VT.SimpleTy) {
12892 case MVT::v2i64:
12893 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12894 case MVT::v2f64:
12895 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12896 case MVT::v4i32:
12897 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12898 case MVT::v4f32:
12899 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12900 case MVT::v8i16:
12901 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12902 case MVT::v16i8:
12903 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12905 default:
12906 llvm_unreachable("Unimplemented!");
12907 }
12908 }
12910 /// Generic routine to split vector shuffle into half-sized shuffles.
12912 /// This routine just extracts two subvectors, shuffles them independently, and
12913 /// then concatenates them back together. This should work effectively with all
12914 /// AVX vector shuffle types.
12915 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12916 SDValue V2, ArrayRef<int> Mask,
12917 SelectionDAG &DAG) {
12918 assert(VT.getSizeInBits() >= 256 &&
12919 "Only for 256-bit or wider vector shuffles!");
12920 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12921 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12923 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12924 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12926 int NumElements = VT.getVectorNumElements();
12927 int SplitNumElements = NumElements / 2;
12928 MVT ScalarVT = VT.getVectorElementType();
12929 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12931 // Rather than splitting build-vectors, just build two narrower build
12932 // vectors. This helps shuffling with splats and zeros.
12933 auto SplitVector = [&](SDValue V) {
12934 V = peekThroughBitcasts(V);
12936 MVT OrigVT = V.getSimpleValueType();
12937 int OrigNumElements = OrigVT.getVectorNumElements();
12938 int OrigSplitNumElements = OrigNumElements / 2;
12939 MVT OrigScalarVT = OrigVT.getVectorElementType();
12940 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12942 SDValue LoV, HiV;
12944 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12945 if (!BV) {
12946 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12947 DAG.getIntPtrConstant(0, DL));
12948 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12949 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12950 } else {
12952 SmallVector<SDValue, 16> LoOps, HiOps;
12953 for (int i = 0; i < OrigSplitNumElements; ++i) {
12954 LoOps.push_back(BV->getOperand(i));
12955 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12956 }
12957 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12958 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12959 }
12960 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12961 DAG.getBitcast(SplitVT, HiV));
12962 };
12964 SDValue LoV1, HiV1, LoV2, HiV2;
12965 std::tie(LoV1, HiV1) = SplitVector(V1);
12966 std::tie(LoV2, HiV2) = SplitVector(V2);
12968 // Now create two 4-way blends of these half-width vectors.
12969 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12970 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12971 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12972 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12973 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12974 for (int i = 0; i < SplitNumElements; ++i) {
12975 int M = HalfMask[i];
12976 if (M >= NumElements) {
12977 if (M >= NumElements + SplitNumElements)
12978 UseHiV2 = true;
12979 else
12980 UseLoV2 = true;
12981 V2BlendMask[i] = M - NumElements;
12982 BlendMask[i] = SplitNumElements + i;
12983 } else if (M >= 0) {
12984 if (M >= SplitNumElements)
12985 UseHiV1 = true;
12986 else
12987 UseLoV1 = true;
12988 V1BlendMask[i] = M;
12989 BlendMask[i] = i;
12990 }
12991 }
12993 // Because the lowering happens after all combining takes place, we need to
12994 // manually combine these blend masks as much as possible so that we create
12995 // a minimal number of high-level vector shuffle nodes.
12997 // First try just blending the halves of V1 or V2.
12998 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12999 return DAG.getUNDEF(SplitVT);
13000 if (!UseLoV2 && !UseHiV2)
13001 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
13002 if (!UseLoV1 && !UseHiV1)
13003 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
13005 SDValue V1Blend, V2Blend;
13006 if (UseLoV1 && UseHiV1) {
13007 V1Blend =
13008 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
13009 } else {
13010 // We only use half of V1 so map the usage down into the final blend mask.
13011 V1Blend = UseLoV1 ? LoV1 : HiV1;
13012 for (int i = 0; i < SplitNumElements; ++i)
13013 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
13014 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
13015 }
13016 if (UseLoV2 && UseHiV2) {
13017 V2Blend =
13018 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
13019 } else {
13020 // We only use half of V2 so map the usage down into the final blend mask.
13021 V2Blend = UseLoV2 ? LoV2 : HiV2;
13022 for (int i = 0; i < SplitNumElements; ++i)
13023 if (BlendMask[i] >= SplitNumElements)
13024 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
13025 }
13026 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
13027 };
13028 SDValue Lo = HalfBlend(LoMask);
13029 SDValue Hi = HalfBlend(HiMask);
13030 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13031 }
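// Worked example (illustrative): the v8i32 mask <0, 8, 1, 9, 6, 14, 7, 15>
// never crosses the 128-bit boundary, so HalfBlend turns the low half into
// shuffle(LoV1, LoV2, <0, 4, 1, 5>) and the high half into
// shuffle(HiV1, HiV2, <2, 6, 3, 7>), and the two v4i32 results are simply
// concatenated back together.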
13033 /// Either split a vector in halves or decompose the shuffles and the
13034 /// blend.
13036 /// This is provided as a good fallback for many lowerings of non-single-input
13037 /// shuffles with more than one 128-bit lane. In those cases, we want to select
13038 /// between splitting the shuffle into 128-bit components and stitching those
13039 /// back together vs. extracting the single-input shuffles and blending those
13040 /// results.
13041 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
13042 SDValue V1, SDValue V2,
13043 ArrayRef<int> Mask,
13044 SelectionDAG &DAG) {
13045 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
13046 "shuffles as it could then recurse on itself.");
13047 int Size = Mask.size();
13049 // If this can be modeled as a broadcast of two elements followed by a blend,
13050 // prefer that lowering. This is especially important because broadcasts can
13051 // often fold with memory operands.
13052 auto DoBothBroadcast = [&] {
13053 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
13054 for (int M : Mask)
13055 if (M >= Size) {
13056 if (V2BroadcastIdx < 0)
13057 V2BroadcastIdx = M - Size;
13058 else if (M - Size != V2BroadcastIdx)
13059 return false;
13060 } else if (M >= 0) {
13061 if (V1BroadcastIdx < 0)
13062 V1BroadcastIdx = M;
13063 else if (M != V1BroadcastIdx)
13064 return false;
13065 }
13066 return true;
13067 };
13068 if (DoBothBroadcast())
13069 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
13072 // If the inputs all stem from a single 128-bit lane of each input, then we
13073 // split them rather than blending because the split will decompose to
13074 // unusually few instructions.
13075 int LaneCount = VT.getSizeInBits() / 128;
13076 int LaneSize = Size / LaneCount;
13077 SmallBitVector LaneInputs[2];
13078 LaneInputs[0].resize(LaneCount, false);
13079 LaneInputs[1].resize(LaneCount, false);
13080 for (int i = 0; i < Size; ++i)
13081 if (Mask[i] >= 0)
13082 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
13083 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
13084 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13086 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
13087 // that the decomposed single-input shuffles don't end up here.
13088 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
13089 }
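// e.g. (illustrative) the v8f32 mask <0, 12, 0, 12, 0, 12, 0, 12> satisfies
// DoBothBroadcast (V1BroadcastIdx == 0, V2BroadcastIdx == 4), so it
// decomposes into two broadcasts and a cheap alternating blend rather than a
// split.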
13091 /// Lower a vector shuffle crossing multiple 128-bit lanes as
13092 /// a permutation and blend of those lanes.
13094 /// This essentially blends the out-of-lane inputs to each lane into the lane
13095 /// from a permuted copy of the vector. This lowering strategy results in four
13096 /// instructions in the worst case for a single-input cross lane shuffle which
13097 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
13098 /// of. Special cases for each particular shuffle pattern should be handled
13099 /// prior to trying this lowering.
13100 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
13101 SDValue V1, SDValue V2,
13102 ArrayRef<int> Mask,
13103 SelectionDAG &DAG,
13104 const X86Subtarget &Subtarget) {
13105 // FIXME: This should probably be generalized for 512-bit vectors as well.
13106 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
13107 int Size = Mask.size();
13108 int LaneSize = Size / 2;
13110 // If there are only inputs from one 128-bit lane, splitting will in fact be
13111 // less expensive. The flags track whether the given lane contains an element
13112 // that crosses to another lane.
13113 if (!Subtarget.hasAVX2()) {
13114 bool LaneCrossing[2] = {false, false};
13115 for (int i = 0; i < Size; ++i)
13116 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
13117 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
13118 if (!LaneCrossing[0] || !LaneCrossing[1])
13119 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13120 } else {
13121 bool LaneUsed[2] = {false, false};
13122 for (int i = 0; i < Size; ++i)
13123 if (Mask[i] >= 0)
13124 LaneUsed[(Mask[i] / LaneSize)] = true;
13125 if (!LaneUsed[0] || !LaneUsed[1])
13126 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13127 }
13129 assert(V2.isUndef() &&
13130 "This last part of this routine only works on single input shuffles");
13132 SmallVector<int, 32> FlippedBlendMask(Size);
13133 for (int i = 0; i < Size; ++i)
13134 FlippedBlendMask[i] =
13135 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
13136 ? Mask[i]
13137 : Mask[i] % LaneSize +
13138 (i / LaneSize) * LaneSize + Size);
13140 // Flip the vector, and blend the results which should now be in-lane.
13141 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
13142 SDValue Flipped = DAG.getBitcast(PVT, V1);
13143 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
13144 {2, 3, 0, 1});
13145 Flipped = DAG.getBitcast(VT, Flipped);
13146 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
13147 }
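// Worked example (illustrative): for the single-input v4f64 mask <0, 2, 1, 3>
// elements 1 and 2 cross lanes, so FlippedBlendMask becomes <0, 4, 7, 3>: the
// flipped copy (lanes swapped via <2, 3, 0, 1>) supplies V1[2] at position 1
// and V1[1] at position 2, while the final in-lane blend takes the rest from
// the original V1.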
13149 /// Handle lowering 2-lane 128-bit shuffles.
13150 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
13151 SDValue V2, ArrayRef<int> Mask,
13152 const APInt &Zeroable,
13153 const X86Subtarget &Subtarget,
13154 SelectionDAG &DAG) {
13155 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
13156 if (Subtarget.hasAVX2() && V2.isUndef())
13157 return SDValue();
13159 SmallVector<int, 4> WidenedMask;
13160 if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
13161 return SDValue();
13163 bool IsLowZero = (Zeroable & 0x3) == 0x3;
13164 bool IsHighZero = (Zeroable & 0xc) == 0xc;
13166 // Try to use an insert into a zero vector.
13167 if (WidenedMask[0] == 0 && IsHighZero) {
13168 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13169 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13170 DAG.getIntPtrConstant(0, DL));
13171 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
13172 getZeroVector(VT, Subtarget, DAG, DL), LoV,
13173 DAG.getIntPtrConstant(0, DL));
13174 }
13176 // TODO: If minimizing size and one of the inputs is a zero vector and the
13177 // zero vector has only one use, we could use a VPERM2X128 to save the
13178 // instruction bytes needed to explicitly generate the zero vector.
13180 // Blends are faster and handle all the non-lane-crossing cases.
13181 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
13182 Zeroable, Subtarget, DAG))
13183 return Blend;
13185 // If either input operand is a zero vector, use VPERM2X128 because its mask
13186 // allows us to replace the zero input with an implicit zero.
13187 if (!IsLowZero && !IsHighZero) {
13188 // Check for patterns which can be matched with a single insert of a 128-bit
13189 // subvector.
13190 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
13191 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
13193 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
13194 // this will likely become vinsertf128 which can't fold a 256-bit memop.
13195 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
13196 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13197 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13198 OnlyUsesV1 ? V1 : V2,
13199 DAG.getIntPtrConstant(0, DL));
13200 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
13201 DAG.getIntPtrConstant(2, DL));
13202 }
13203 }
13205 // Try to use SHUF128 if possible.
13206 if (Subtarget.hasVLX()) {
13207 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
13208 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
13209 ((WidenedMask[1] % 2) << 1);
13210 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
13211 DAG.getConstant(PermMask, DL, MVT::i8));
13212 }
13213 }
13214 }
13216 // Otherwise form a 128-bit permutation. After accounting for undefs,
13217 // convert the 64-bit shuffle mask selection values into 128-bit
13218 // selection bits by dividing the indexes by 2 and shifting into positions
13219 // defined by a vperm2*128 instruction's immediate control byte.
13221 // The immediate permute control byte looks like this:
13222 // [1:0] - select 128 bits from sources for low half of destination
13223 // [2]   - ignore
13224 // [3] - zero low half of destination
13225 // [5:4] - select 128 bits from sources for high half of destination
13226 // [6]   - ignore
13227 // [7] - zero high half of destination
13229 assert((WidenedMask[0] >= 0 || IsLowZero) &&
13230 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
13232 unsigned PermMask = 0;
13233 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
13234 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
13236 // Check the immediate mask and replace unused sources with undef.
13237 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
13238 V1 = DAG.getUNDEF(VT);
13239 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
13240 V2 = DAG.getUNDEF(VT);
13242 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
13243 DAG.getConstant(PermMask, DL, MVT::i8));
13244 }
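// Worked example (illustrative): the v4f64 mask <0, 1, 6, 7> widens to
// <0, 3>, giving PermMask = (0 << 0) | (3 << 4) = 0x30, i.e. a VPERM2X128
// taking the low half of V1 and the high half of V2. If the high half were
// zeroable instead, bit 7 would be set and the immediate would become 0x80.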
13246 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
13247 /// shuffling each lane.
13249 /// This will only succeed when the result of fixing the 128-bit lanes results
13250 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
13251 /// each 128-bit lane. This handles many cases where we can quickly blend away
13252 /// the lane crosses early and then use simpler shuffles within each lane.
13254 /// FIXME: It might be worthwhile at some point to support this without
13255 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
13256 /// in x86 only floating point has interesting non-repeating shuffles, and even
13257 /// those are still *marginally* more expensive.
13258 static SDValue lowerVectorShuffleByMerging128BitLanes(
13259 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13260 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13261 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
13263 int Size = Mask.size();
13264 int LaneSize = 128 / VT.getScalarSizeInBits();
13265 int NumLanes = Size / LaneSize;
13266 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
13268 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
13269 // check whether the in-128-bit lane shuffles share a repeating pattern.
13270 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
13271 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
13272 for (int i = 0; i < Size; ++i) {
13273 if (Mask[i] < 0)
13274 continue;
13276 int j = i / LaneSize;
13278 if (Lanes[j] < 0) {
13279 // First entry we've seen for this lane.
13280 Lanes[j] = Mask[i] / LaneSize;
13281 } else if (Lanes[j] != Mask[i] / LaneSize) {
13282 // This doesn't match the lane selected previously!
13283 return SDValue();
13284 }
13286 // Check that within each lane we have a consistent shuffle mask.
13287 int k = i % LaneSize;
13288 if (InLaneMask[k] < 0) {
13289 InLaneMask[k] = Mask[i] % LaneSize;
13290 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
13291 // This doesn't fit a repeating in-lane mask.
13292 return SDValue();
13293 }
13294 }
13296 // First shuffle the lanes into place.
13297 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
13298 VT.getSizeInBits() / 64);
13299 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
13300 for (int i = 0; i < NumLanes; ++i)
13301 if (Lanes[i] >= 0) {
13302 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
13303 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
13304 }
13306 V1 = DAG.getBitcast(LaneVT, V1);
13307 V2 = DAG.getBitcast(LaneVT, V2);
13308 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
13310 // Cast it back to the type we actually want.
13311 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
13313 // Now do a simple shuffle that isn't lane crossing.
13314 SmallVector<int, 8> NewMask((unsigned)Size, -1);
13315 for (int i = 0; i < Size; ++i)
13316 if (Mask[i] >= 0)
13317 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
13318 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
13319 "Must not introduce lane crosses at this point!");
13321 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
13322 }
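// Worked example (illustrative): the v8i32 mask <6, 7, 4, 5, 2, 3, 0, 1>
// picks lane 1 for the low half and lane 0 for the high half with the
// repeating in-lane pattern <2, 3, 0, 1>. The first shuffle swaps the 128-bit
// halves at v4i64 granularity (LaneMask <2, 3, 0, 1>), and the non-crossing
// NewMask <2, 3, 0, 1, 6, 7, 4, 5> then finishes the job.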
13324 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
13325 /// This allows for fast cases such as subvector extraction/insertion
13326 /// or shuffling smaller vector types which can lower more efficiently.
13327 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
13328 SDValue V1, SDValue V2,
13329 ArrayRef<int> Mask,
13330 const X86Subtarget &Subtarget,
13331 SelectionDAG &DAG) {
13332 assert((VT.is256BitVector() || VT.is512BitVector()) &&
13333 "Expected 256-bit or 512-bit vector");
13335 unsigned NumElts = VT.getVectorNumElements();
13336 unsigned HalfNumElts = NumElts / 2;
13337 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
13339 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
13340 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
13341 if (!UndefLower && !UndefUpper)
13342 return SDValue();
13344 // Upper half is undef and lower half is whole upper subvector.
13345 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13346 if (UndefUpper &&
13347 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
13348 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13349 DAG.getIntPtrConstant(HalfNumElts, DL));
13350 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13351 DAG.getIntPtrConstant(0, DL));
13352 }
13354 // Lower half is undef and upper half is whole lower subvector.
13355 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13356 if (UndefLower &&
13357 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
13358 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13359 DAG.getIntPtrConstant(0, DL));
13360 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13361 DAG.getIntPtrConstant(HalfNumElts, DL));
13362 }
13364 // If the shuffle only uses two of the four halves of the input operands,
13365 // then extract them and perform the 'half' shuffle at half width.
13366 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
13367 int HalfIdx1 = -1, HalfIdx2 = -1;
13368 SmallVector<int, 8> HalfMask(HalfNumElts);
13369 unsigned Offset = UndefLower ? HalfNumElts : 0;
13370 for (unsigned i = 0; i != HalfNumElts; ++i) {
13371 int M = Mask[i + Offset];
13372 if (M < 0) {
13373 HalfMask[i] = M;
13374 continue;
13375 }
13377 // Determine which of the 4 half vectors this element is from.
13378 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
13379 int HalfIdx = M / HalfNumElts;
13381 // Determine the element index into its half vector source.
13382 int HalfElt = M % HalfNumElts;
13384 // We can shuffle with up to 2 half vectors; set the new 'half'
13385 // shuffle mask accordingly.
13386 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
13387 HalfMask[i] = HalfElt;
13388 HalfIdx1 = HalfIdx;
13389 continue;
13390 }
13391 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
13392 HalfMask[i] = HalfElt + HalfNumElts;
13393 HalfIdx2 = HalfIdx;
13394 continue;
13395 }
13397 // Too many half vectors referenced.
13398 return SDValue();
13399 }
13400 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
13402 // Only shuffle the halves of the inputs when useful.
13403 int NumLowerHalves =
13404 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
13405 int NumUpperHalves =
13406 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
13408 // uuuuXXXX - don't extract uppers just to insert again.
13409 if (UndefLower && NumUpperHalves != 0)
13410 return SDValue();
13412 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
13413 if (UndefUpper && NumUpperHalves == 2)
13414 return SDValue();
13416 // AVX2 - XXXXuuuu - always extract lowers.
13417 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
13418 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
13419 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13420 return SDValue();
13421 // AVX2 supports variable 32-bit element cross-lane shuffles.
13422 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
13423 // XXXXuuuu - don't extract lowers and uppers.
13424 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
13425 return SDValue();
13426 }
13427 }
13429 // AVX512 - XXXXuuuu - always extract lowers.
13430 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
13431 return SDValue();
13433 auto GetHalfVector = [&](int HalfIdx) {
13434 if (HalfIdx < 0)
13435 return DAG.getUNDEF(HalfVT);
13436 SDValue V = (HalfIdx < 2 ? V1 : V2);
13437 HalfIdx = (HalfIdx % 2) * HalfNumElts;
13438 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
13439 DAG.getIntPtrConstant(HalfIdx, DL));
13440 };
13442 SDValue Half1 = GetHalfVector(HalfIdx1);
13443 SDValue Half2 = GetHalfVector(HalfIdx2);
13444 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
13445 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
13446 DAG.getIntPtrConstant(Offset, DL));
13447 }
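// Worked example (illustrative): the v8f32 mask <u, u, u, u, 0, 1, 2, 3> has
// an undef lower half whose upper half is exactly V1's low subvector, so it
// lowers to a single 128-bit subvector insert (e.g. vinsertf128 $1).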
13449 /// Test whether the specified input (0 or 1) is in-place blended by the
13450 /// given mask.
13452 /// This returns true if the elements from a particular input are already in the
13453 /// slot required by the given mask and require no permutation.
13454 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
13455 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13456 int Size = Mask.size();
13457 for (int i = 0; i < Size; ++i)
13458 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13459 return false;
13461 return true;
13462 }
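// e.g. (illustrative) for the mask <0, 1, 7, 6> input 0 is in place (elements
// 0 and 1 already sit at positions 0 and 1), but input 1 is not, since
// element 7 would need to land at position 2.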
13464 /// Handle case where shuffle sources are coming from the same 128-bit lane and
13465 /// every lane can be represented as the same repeating mask - allowing us to
13466 /// shuffle the sources with the repeating shuffle and then permute the result
13467 /// to the destination lanes.
13468 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
13469 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13470 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13471 int NumElts = VT.getVectorNumElements();
13472 int NumLanes = VT.getSizeInBits() / 128;
13473 int NumLaneElts = NumElts / NumLanes;
13475 // On AVX2 we may be able to just shuffle the lowest elements and then
13476 // broadcast the result.
13477 if (Subtarget.hasAVX2()) {
13478 for (unsigned BroadcastSize : {16, 32, 64}) {
13479 if (BroadcastSize <= VT.getScalarSizeInBits())
13480 continue;
13481 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
13483 // Attempt to match a repeating pattern every NumBroadcastElts,
13484 // accounting for UNDEFs but only references the lowest 128-bit
13485 // lane of the inputs.
13486 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
13487 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13488 for (int j = 0; j != NumBroadcastElts; ++j) {
13489 int M = Mask[i + j];
13490 if (M < 0)
13491 continue;
13492 int &R = RepeatMask[j];
13493 if (0 != ((M % NumElts) / NumLaneElts))
13494 return false;
13495 if (0 <= R && R != M)
13496 return false;
13497 R = M;
13498 }
13499 return true;
13500 };
13502 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
13503 if (!FindRepeatingBroadcastMask(RepeatMask))
13504 continue;
13506 // Shuffle the (lowest) repeated elements in place for broadcast.
13507 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
13509 // Shuffle the actual broadcast.
13510 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
13511 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13512 for (int j = 0; j != NumBroadcastElts; ++j)
13513 BroadcastMask[i + j] = j;
13514 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
13515 BroadcastMask);
13516 }
13517 }
13519 // Bail if the shuffle mask doesn't cross 128-bit lanes.
13520 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
13521 return SDValue();
13523 // Bail if we already have a repeated lane shuffle mask.
13524 SmallVector<int, 8> RepeatedShuffleMask;
13525 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
13526 return SDValue();
13528 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
13529 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
13530 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
13531 int NumSubLanes = NumLanes * SubLaneScale;
13532 int NumSubLaneElts = NumLaneElts / SubLaneScale;
13534 // Check that all the sources are coming from the same lane and see if we can
13535 // form a repeating shuffle mask (local to each sub-lane). At the same time,
13536 // determine the source sub-lane for each destination sub-lane.
13537 int TopSrcSubLane = -1;
13538 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
13539 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13540 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13541 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13543 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13544 // Extract the sub-lane mask, check that it all comes from the same lane
13545 // and normalize the mask entries to come from the first lane.
13546 int SrcLane = -1;
13547 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13548 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13549 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
13550 if (M < 0)
13551 continue;
13552 int Lane = (M % NumElts) / NumLaneElts;
13553 if ((0 <= SrcLane) && (SrcLane != Lane))
13554 return SDValue();
13555 SrcLane = Lane;
13556 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
13557 SubLaneMask[Elt] = LocalM;
13558 }
13560 // Whole sub-lane is UNDEF.
13561 if (SrcLane < 0)
13562 continue;
13564 // Attempt to match against the candidate repeated sub-lane masks.
13565 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13566 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13567 for (int i = 0; i != NumSubLaneElts; ++i) {
13568 if (M1[i] < 0 || M2[i] < 0)
13569 continue;
13570 if (M1[i] != M2[i])
13571 return false;
13572 }
13573 return true;
13574 };
13576 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
13577 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
13578 continue;
13580 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13581 for (int i = 0; i != NumSubLaneElts; ++i) {
13582 int M = SubLaneMask[i];
13585 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13586 "Unexpected mask element");
13587 RepeatedSubLaneMask[i] = M;
13588 }
13590 // Track the top most source sub-lane - by setting the remaining to UNDEF
13591 // we can greatly simplify shuffle matching.
13592 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13593 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
13594 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
13595 break;
13596 }
13598 // Bail if we failed to find a matching repeated sub-lane mask.
13599 if (Dst2SrcSubLanes[DstSubLane] < 0)
13600 return SDValue();
13601 }
13602 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13603 "Unexpected source lane");
13605 // Create a repeating shuffle mask for the entire vector.
13606 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13607 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13608 int Lane = SubLane / SubLaneScale;
13609 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13610 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13611 int M = RepeatedSubLaneMask[Elt];
13614 int Idx = (SubLane * NumSubLaneElts) + Elt;
13615 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
13616 }
13617 }
13618 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13620 // Shuffle each source sub-lane to its destination.
13621 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13622 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13623 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
13624 if (SrcSubLane < 0)
13626 for (int j = 0; j != NumSubLaneElts; ++j)
13627 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
13630 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
13631 SubLaneMask);
13632 }
13634 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13635 unsigned &ShuffleImm,
13636 ArrayRef<int> Mask) {
13637 int NumElts = VT.getVectorNumElements();
13638 assert(VT.getScalarSizeInBits() == 64 &&
13639 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13640 "Unexpected data type for VSHUFPD");
13642 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
13643 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
13644 ShuffleImm = 0;
13645 bool ShufpdMask = true;
13646 bool CommutableMask = true;
13647 for (int i = 0; i < NumElts; ++i) {
13648 if (Mask[i] == SM_SentinelUndef)
13649 continue;
13652 int Val = (i & 6) + NumElts * (i & 1);
13653 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13654 if (Mask[i] < Val || Mask[i] > Val + 1)
13655 ShufpdMask = false;
13656 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13657 CommutableMask = false;
13658 ShuffleImm |= (Mask[i] % 2) << i;
13659 }
13661 if (ShufpdMask)
13662 return true;
13663 if (CommutableMask) {
13664 std::swap(V1, V2);
13665 return true;
13666 }
13668 return false;
13669 }
13671 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13672 ArrayRef<int> Mask, SDValue V1,
13673 SDValue V2, SelectionDAG &DAG) {
13674 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
13675 "Unexpected data type for VSHUFPD");
13677 unsigned Immediate = 0;
13678 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
13679 return SDValue();
13681 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13682 DAG.getConstant(Immediate, DL, MVT::i8));
13683 }
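// Worked example (illustrative): the v4f64 mask <0, 5, 2, 7> stays within the
// per-position pairs {0,1}, {4,5}, {2,3}, {6,7}, so ShufpdMask holds and the
// immediate is (0%2) | (5%2 << 1) | (2%2 << 2) | (7%2 << 3) = 0b1010 = 0xA.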
13685 /// Handle lowering of 4-lane 64-bit floating point shuffles.
13687 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13688 /// isn't available.
13689 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13690 const APInt &Zeroable,
13691 SDValue V1, SDValue V2,
13692 const X86Subtarget &Subtarget,
13693 SelectionDAG &DAG) {
13694 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13695 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13696 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13698 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13699 Zeroable, Subtarget, DAG))
13700 return V;
13702 if (V2.isUndef()) {
13703 // Check for being able to broadcast a single element.
13704 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13705 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13706 return Broadcast;
13708 // Use low duplicate instructions for masks that match their pattern.
13709 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13710 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13712 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13713 // Non-half-crossing single input shuffles can be lowered with an
13714 // interleaved permutation.
13715 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13716 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13717 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13718 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13719 }
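// e.g. (illustrative) the unary mask <1, 0, 3, 2> yields VPERMILPMask =
// 1 | (0 << 1) | (1 << 2) | (0 << 3) = 0b0101, i.e. vpermilpd $5, swapping
// the two elements within each 128-bit lane.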
13721 // With AVX2 we have direct support for this permutation.
13722 if (Subtarget.hasAVX2())
13723 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13724 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13726 // Try to create an in-lane repeating shuffle mask and then shuffle the
13727 // results into the target lanes.
13728 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13729 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13730 return V;
13732 // Otherwise, fall back.
13733 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13734 DAG, Subtarget);
13735 }
13737 // Use dedicated unpack instructions for masks that match their pattern.
13738 if (SDValue V =
13739 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13740 return V;
13742 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13743 Zeroable, Subtarget, DAG))
13744 return Blend;
13746 // Check if the blend happens to exactly fit that of SHUFPD.
13747 if (SDValue Op =
13748 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13749 return Op;
13751 // Try to create an in-lane repeating shuffle mask and then shuffle the
13752 // results into the target lanes.
13753 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13754 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13755 return V;
13757 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13758 // shuffle. However, if we have AVX2 and either inputs are already in place,
13759 // we will be able to shuffle even across lanes the other input in a single
13760 // instruction so skip this pattern.
13761 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13762 isShuffleMaskInputInPlace(1, Mask))))
13763 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13764 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13765 return Result;
13766 // If we have VLX support, we can use VEXPAND.
13767 if (Subtarget.hasVLX())
13768 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13769 V1, V2, DAG, Subtarget))
13770 return V;
13772 // If we have AVX2 then we always want to lower with a blend because at v4 we
13773 // can fully permute the elements.
13774 if (Subtarget.hasAVX2())
13775 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13776 Mask, DAG);
13778 // Otherwise fall back on generic lowering.
13779 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13780 }
13782 /// Handle lowering of 4-lane 64-bit integer shuffles.
13784 /// This routine is only called when we have AVX2 and thus a reasonable
13785 /// instruction set for v4i64 shuffling.
13786 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13787 const APInt &Zeroable,
13788 SDValue V1, SDValue V2,
13789 const X86Subtarget &Subtarget,
13790 SelectionDAG &DAG) {
13791 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13792 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13793 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13794 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13796 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13797 Zeroable, Subtarget, DAG))
13798 return V;
13800 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13801 Zeroable, Subtarget, DAG))
13802 return Blend;
13804 // Check for being able to broadcast a single element.
13805 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13806 Mask, Subtarget, DAG))
13807 return Broadcast;
13809 if (V2.isUndef()) {
13810 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13811 // can use lower latency instructions that will operate on both lanes.
13812 SmallVector<int, 2> RepeatedMask;
13813 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
13814 SmallVector<int, 4> PSHUFDMask;
13815 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13816 return DAG.getBitcast(
13817 MVT::v4i64,
13818 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13819 DAG.getBitcast(MVT::v8i32, V1),
13820 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13821 }
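// Worked example (illustrative): the v4i64 mask <1, 0, 3, 2> repeats <1, 0>
// per lane; scaling by 2 gives PSHUFDMask <2, 3, 0, 1> and the immediate
// 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E, so the whole shuffle becomes one
// pshufd $0x4E on the v8i32 bitcast.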
13823 // AVX2 provides a direct instruction for permuting a single input across
13824 // lanes.
13825 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13826 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13827 }
13829 // Try to use shift instructions.
13830 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13831 Zeroable, Subtarget, DAG))
13832 return Shift;
13834 // If we have VLX support, we can use VALIGN or VEXPAND.
13835 if (Subtarget.hasVLX()) {
13836 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13837 Mask, Subtarget, DAG))
13838 return Rotate;
13840 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13841 V1, V2, DAG, Subtarget))
13842 return V;
13843 }
13845 // Try to use PALIGNR.
13846 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13847 Mask, Subtarget, DAG))
13848 return Rotate;
13850 // Use dedicated unpack instructions for masks that match their pattern.
13851 if (SDValue V =
13852 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13853 return V;
13855 // Try to create an in-lane repeating shuffle mask and then shuffle the
13856 // results into the target lanes.
13857 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13858 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13859 return V;
13861 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13862 // shuffle. However, if we have AVX2 and either inputs are already in place,
13863 // we will be able to shuffle even across lanes the other input in a single
13864 // instruction so skip this pattern.
13865 if (!isShuffleMaskInputInPlace(0, Mask) &&
13866 !isShuffleMaskInputInPlace(1, Mask))
13867 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13868 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13869 return Result;
13871 // Otherwise fall back on generic blend lowering.
13872 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13873 Mask, DAG);
13874 }
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2,
                                        DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes use the variable mask to VPERMILPS.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2,
                                                   Mask, DAG, Subtarget);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
                                               V1, V2, DAG, Subtarget))
      return V;

  // Without AVX512, if the mask is equivalent to an in-lane unpack of 16-bit
  // elements, try to split: after the split we get more efficient code using
  // vpunpcklwd and vpunpckhwd than with vblend.
  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
                                                     Mask, DAG))
      return V;

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
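  // (e.g. with V2 all-zero, the mask <0, 8, 1, 8, 2, 8, 3, 8> zero-extends
  // the low four i32 elements to i64 and can be matched to VPMOVZXDQ.)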
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Without AVX512, if the mask is equivalent to an in-lane unpack of 16-bit
  // elements, try to split: after the split we get more efficient code using
  // vpunpcklwd and vpunpckhwd than with vblend.
  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
      !Subtarget.hasAVX512())
    if (SDValue V =
            lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
      return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // If we have VLX support, we can use VALIGN or EXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
                                               V1, V2, DAG, Subtarget))
      return V;
  }

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If the shuffle patterns aren't repeated but it is a single input, directly
  // generate a cross-lane VPERMD instruction.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
                                                  CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v8i32, ShufPS);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
                                                    Mask, DAG);
}
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
                                             Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
                                                     Mask, DAG, Subtarget);

    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
      return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512BWVL can lower to VPERMW.
  if (Subtarget.hasBWI() && Subtarget.hasVLX())
    return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
                                             Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                   DAG, Subtarget);

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512VBMIVL can lower to VPERMB.
  if (Subtarget.hasVBMI() && Subtarget.hasVLX())
    return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = VT.getVectorNumElements();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
  // can check for those subtargets here and avoid much of the subtarget
  // querying in the per-vector-type lowering routines. With AVX1 we have
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types. Since we'll use floating point types there eventually, just
  // immediately cast everything to a float and operate entirely in that domain.
  if (VT.isInteger() && !Subtarget.hasAVX2()) {
    int ElementBits = VT.getScalarSizeInBits();
    if (ElementBits < 32) {
      // No floating point type available; if we can't use the bit operations
      // for masking/blending then decompose into 128-bit vectors.
      if (SDValue V =
              lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
        return V;
      if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return V;
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
    }

    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                VT.getVectorNumElements());
    V1 = DAG.getBitcast(FpVT, V1);
    V2 = DAG.getBitcast(FpVT, V2);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
  }

  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}
/// Try to lower a vector shuffle as 128-bit shuffles.
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
                                        ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128bit shuffle.");

  // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
  // is most probably the better solution there.
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // Try to use an insert into a zero vector.
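  // Zeroable has one bit per 64-bit element here: 0xf0 means the upper four
  // elements (the high 256 bits) are zeroable, and 0x0c means elements 2 and 3
  // are as well, so we insert either the low 128 or the low 256 bits of V1
  // into a zero vector.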
  if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
      (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
    unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Check for patterns which can be matched with a single insert of a 256-bit
  // subvector.
  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
                                        {0, 1, 2, 3, 0, 1, 2, 3});
  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                 OnlyUsesV1 ? V1 : V2,
                                 DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                       DAG.getIntPtrConstant(4, DL));
  }

  assert(WidenedMask.size() == 4);

  // See if this is an insertion of the lower 128-bits of V2 into V1.
  bool IsInsert = true;
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1);
    if (WidenedMask[i] < 0)
      continue;

    // Make sure all V1 subvectors are in place.
    if (WidenedMask[i] < 4) {
      if (WidenedMask[i] != i) {
        IsInsert = false;
        break;
      }
    } else {
      // Make sure we only have a single V2 index and it's the lowest 128 bits.
      if (V2Index >= 0 || WidenedMask[i] != 4) {
        IsInsert = false;
        break;
      }
      V2Index = i;
    }
  }
  if (IsInsert && V2Index >= 0) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                                 DAG.getIntPtrConstant(0, DL));
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

  // Try to lower to vshuf64x2/vshuf32x4.
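  // The immediate selects one 128-bit chunk per destination position, two
  // bits each; destination positions 0 and 1 must draw from the same source
  // operand, as must positions 2 and 3. e.g. the widened mask <0, 2, 5, 7>
  // takes chunks 0 and 2 of V1 and chunks 1 and 3 of V2, giving
  // Ops = {V1, V2} and PermMask = 0b11011000.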
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  unsigned PermMask = 0;
  // Ensure elements came from the same Op.
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1);
    if (WidenedMask[i] < 0)
      continue;

    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();

    // Convert the 128-bit shuffle mask selection values into 128-bit selection
    // bits defined by a vshuf64x2 instruction's immediate control byte.
    PermMask |= (WidenedMask[i] % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
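      // Each immediate bit selects the low (0) or high (1) double within the
      // corresponding 128-bit lane; e.g. Mask <1, 0, 3, 2, 5, 4, 7, 6> swaps
      // each pair and encodes as 0b01010101 (0x55).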
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
    }

    SmallVector<int, 4> RepeatedMask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  }

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
                                   Subtarget, DAG))
    return Shuf128;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Unpck;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op =
          lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Op;

  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
                                             V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue Unpck =
            lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
      return Unpck;

    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Otherwise, fall back to a SHUFPS sequence.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2,
                                        DAG);
  }

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes that doesn't cross lanes, use the variable mask VPERMILPS.
  if (V2.isUndef() &&
      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
    SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
  }

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // When the shuffle mask is mirrored between the 128-bit lanes, we can use
    // lower-latency instructions that operate on all four 128-bit lanes.
    SmallVector<int, 2> Repeated128Mask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
      SmallVector<int, 4> PSHUFDMask;
      scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v8i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                      DAG.getBitcast(MVT::v16i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    SmallVector<int, 4> Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
                                   V1, V2, Subtarget, DAG))
    return Shuf128;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Unpck;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
                                             V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use byte rotation instructions.
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than using a permv shuffle.
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
                                                  CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v16i32, ShufPS);
  }

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (V2.isUndef()) {
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v32 case.
      return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // VBMI can use VPERMV/VPERMV3 byte shuffles.
  if (Subtarget.hasVBMI())
    return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");

  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast =
          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Broadcast;

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 512 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume that
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64i8:
    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                      MVT VT, SDValue V1, SDValue V2,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned NumElts = Mask.size();

  // Try to recognize shuffles that are just padding a subvector with zeros.
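  // e.g. a v16i1 mask <0, 1, 2, 3, ...> whose top twelve elements are
  // zeroable is just the low v4i1 subvector reinserted over a zero vector.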
  unsigned SubvecElts = 0;
  for (int i = 0; i != (int)NumElts; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i)
      break;

    ++SubvecElts;
  }
  assert(SubvecElts != NumElts && "Identity shuffle?");

  // Clip to a power of 2.
  SubvecElts = PowerOf2Floor(SubvecElts);

  // Make sure the number of zeroable bits in the top at least covers the bits
  // not covered by the subvector.
  if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
                                  V1, DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL),
                       Extract, DAG.getIntPtrConstant(0, DL));
  }

  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");
  MVT ExtVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements");
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
    // shuffle.
    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  case MVT::v16i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
    break;
  case MVT::v32i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
    break;
  case MVT::v64i1:
    ExtVT = MVT::v64i8;
    break;
  }

  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
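
  // e.g. for v8i1 on KNL both masks are sign-extended to v8i64, so each
  // element is all-ones or all-zeros, the shuffle runs in the vector domain,
  // and the result is converted back either by truncation or, below, by the
  // signed compare 0 > x, which is true exactly for the all-ones elements.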
  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // The i1 elements were sign-extended, so the mask can be recovered with a
  // signed compare against zero instead of a truncate.
  int NumElems = VT.getVectorNumElements();
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
      (Subtarget.hasDQI() && (NumElems < 32)))
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
                        Shuffle, ISD::SETGT);

  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
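/// For example, the v4i32 mask <4, 5, 6, 3> takes three elements from V2 and
/// only one from V1, so commuting the operands (giving the mask <0, 1, 2, 7>)
/// lets later matching assume that most elements come from V1.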
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
  int NumElements = Mask.size();

  int NumV1Elements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      continue;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;

  // Commute the shuffle as needed such that more elements come from V1 than
  // V2. This allows us to match the shuffle pattern strictly on how many
  // elements come from V1 without handling the symmetric cases.
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  if (NumV2Elements == 0)
    return false;

  // When the number of V1 and V2 elements are the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum of
  // indices for V2. When those are equal, try to ensure that the number of odd
  // indices for V1 is lower than the number of odd indices for V2.
  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
    if (LowV2Elements == LowV1Elements) {
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
      if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
          if (Mask[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (Mask[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
}
/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
         "Can't lower MMX shuffles");

  bool V1IsUndef = V1.isUndef();
  bool V2IsUndef = V2.isUndef();
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  // When we create a shuffle node we put the UNDEF node in the second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef)
    for (int M : Mask)
      if (M >= NumElements) {
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
        for (int &M : NewMask)
          if (M >= NumElements)
            M = -1;
        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
      }

  // Check for illegal shuffle mask element index values.
  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
  assert(llvm::all_of(Mask,
                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
         "Out of bounds shuffle index");

  // We actually see shuffles that are entirely re-arrangements of a set of
  // zero inputs. This mostly happens while decomposing complex shuffles into
  // simple ones. Directly lower these as a buildvector of zeros.
  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  if (Zeroable.isAllOnesValue())
    return getZeroVector(VT, Subtarget, DAG, DL);

  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

  // Create an alternative mask with info about zeroable elements.
  // Here we do not set undef elements as zeroable.
  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
  if (V2IsZero) {
    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
    for (int i = 0; i != NumElements; ++i)
      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
        ZeroableMask[i] = SM_SentinelZero;
  }

  // Try to collapse shuffles into using a vector type with fewer elements but
  // wider element types. We cap this to not form integers or floating point
  // elements wider than 64 bits, but it might be interesting to form i128
  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
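  // e.g. the v4i32 mask <2, 3, 0, 1> widens to the v2i64 mask <1, 0>, which
  // can then be lowered as a single 64-bit element swap.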
  SmallVector<int, 16> WidenedMask;
  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
      canWidenShuffleElements(ZeroableMask, WidenedMask)) {
    MVT NewEltVT = VT.isFloatingPoint()
                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
    int NewNumElts = NumElements / 2;
    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      if (V2IsZero) {
        // Modify the new Mask to take all zeros from the all-zero vector.
        // Choose indices that are blend-friendly.
        bool UsedZeroVector = false;
        assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
               "V2's non-undef elements are used?!");
        for (int i = 0; i != NewNumElts; ++i)
          if (WidenedMask[i] == SM_SentinelZero) {
            WidenedMask[i] = i + NewNumElts;
            UsedZeroVector = true;
          }
        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
        // some elements to be undef.
        if (UsedZeroVector)
          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
      }
      V1 = DAG.getBitcast(NewVT, V1);
      V2 = DAG.getBitcast(NewVT, V2);
      return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }

  // Commute the shuffle if it will improve canonicalization.
  if (canonicalizeShuffleMaskWithCommute(Mask))
    return DAG.getCommutedVectorShuffle(*SVOp);

  if (SDValue V =
          lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
    return V;

  // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is256BitVector())
    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is512BitVector())
    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (Is1BitVector)
    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                  DAG);

  llvm_unreachable("Unimplemented!");
}
/// Try to lower a VSELECT instruction to a vector shuffle.
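/// For example, a v4i32 select with the constant condition <-1, 0, -1, 0>
/// becomes the shuffle mask <0, 5, 2, 7>: even elements come from the true
/// operand, and odd elements (condition zero or undef) from the false operand.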
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  SDValue Cond = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();
  auto *CondBV = cast<BuildVectorSDNode>(Cond);

  // Only non-legal VSELECTs reach this lowering; convert those into generic
  // shuffles and re-use the shuffle lowering path for blends.
  SmallVector<int, 32> Mask;
  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
    SDValue CondElt = CondBV->getOperand(i);
    int M = i;
    // We can't map undef to undef here. They have different meanings. Treat
    // them the same as zero.
    if (CondElt.isUndef() || isNullConstant(CondElt))
      M += Size;
    Mask.push_back(M);
  }

  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
    return Op;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // instructions.
  if (VT.getSizeInBits() == 512) {
    SDValue Cond = Op.getOperand(0);
    // The vNi1 condition case should be handled above as it can be trivially
    // lowered.
    assert(Cond.getValueType().getScalarSizeInBits() ==
               VT.getScalarSizeInBits() &&
           "Should have a size-matched integer condition!");
    // Build a mask by testing the condition against zero.
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
                                getZeroVector(VT, Subtarget, DAG, dl),
                                ISD::SETNE);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return a null
  // value.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16: {
    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
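    // Assuming the usual x86 boolean vector contents, each vXi16 condition
    // element is all-ones or all-zeros, so both bytes of its vXi8 image carry
    // the same sign bit and a byte blend (PBLENDVB) selects the same data.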
    MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
    SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
    SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
    SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
    return DAG.getBitcast(VT, Select);
  }
  }
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to an FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         isNullConstant(Op.getOperand(1))) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // ExtractPS/pextrq works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }

  return SDValue();
}
/// Extract one bit from a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Vec);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  MVT EltVT = Op.getSimpleValueType();

  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
         "Unexpected vector type in ExtractBitFromMaskVector");

  // A variable index can't be handled in mask registers; extend the vector
  // to VR512/VR128 instead.
  if (!isa<ConstantSDNode>(Idx)) {
    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
    // than extending to 128/256-bit.
    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If the kshift instructions of the correct width aren't natively supported
  // then we need to promote the vector to the native size to get the correct
  // zeroing behavior.
  if (VecVT.getVectorNumElements() < 16) {
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
                      DAG.getUNDEF(VecVT), Vec,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Extracts from element 0 are always allowed.
  if (IdxVal != 0) {
    // Use kshiftr instruction to move to the lower element.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(IdxVal, dl, MVT::i8));
  }
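
  // e.g. to extract bit 5 of a v8i1 mask: the mask was widened to v16i1
  // above, KSHIFTR moved bit 5 down to bit 0, and the bitcast + truncate
  // below read it out of the low bit of an i16.
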
  // Shrink to v16i1 since that's always legal.
  if (VecVT.getVectorNumElements() > 16) {
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Convert to a bitcast+aext/trunc.
  MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
  return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
}
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);

  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

  if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than to use VMOVD + VPERMV/PSHUFB (2/3 cycles throughput). The IACA
    // tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // example: extractelement <16 x i8> %a, i32 %i
    //
    // vpshufb sequence:
    //   Block Throughput: 3.00 Cycles
    //   Throughput Bottleneck: Port5
    //
    //   | Num Of |  Ports pressure in cycles   |    |
    //   |  Uops  | 0 - DV |  5  |  6  |  7  |  |    |
    //   ---------------------------------------------
    //   |   1    |        | 1.0 |     |     |  | CP | vmovd xmm1, edi
    //   |   1    |        | 1.0 |     |     |  | CP | vpshufb xmm0, xmm0, xmm1
    //   |   2    |  1.0   | 1.0 |     |     |  | CP | vpextrb eax, xmm0, 0x0
    //   Total Num Of Uops: 4
    //
    // memory sequence:
    //   Block Throughput: 1.00 Cycles
    //   Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    //   |    |    Ports pressure in cycles     |   |   |  |
    //   |Uops|  1  |  2 - D   |  3 - D   | 4 | 5 |  |
    //   ---------------------------------------------------------
    //   |2^  |     |   0.5    |   0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
    //   |1   | 0.5 |          |          |   |0.5|  | lea rax, ptr [rsp-0x18]
    //   |1   |     | 0.5, 0.5 | 0.5, 0.5 |   |   |CP| mov al, byte ptr [rdi+rax*1]
    //   Total Num Of Uops: 4
    return SDValue();
  }
15374 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15376 // If this is a 256-bit vector result, first extract the 128-bit vector and
15377 // then extract the element from the 128-bit vector.
15378 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
15379 // Get the 128-bit vector.
15380 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
15381 MVT EltVT = VecVT.getVectorElementType();
15383 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
15384 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
15386 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
15387 // this can be done with a mask.
15388 IdxVal &= ElemsPerChunk - 1;
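// (Editorial example.) Extracting element 5 of a v8i32: ElemsPerChunk is 4,
// so extract128BitVector picks 128-bit chunk 1 and the masked index is
// 5 & (4 - 1) == 1 within that chunk.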
15389 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
15390 DAG.getConstant(IdxVal, dl, MVT::i32));
15391 }
15393 assert(VecVT.is128BitVector() && "Unexpected vector length");
15395 MVT VT = Op.getSimpleValueType();
15397 if (VT.getSizeInBits() == 16) {
15398 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
15399 // we're going to zero extend the register or fold the store (SSE41 only).
15400 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
15401 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
15402 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
15403 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15404 DAG.getBitcast(MVT::v4i32, Vec), Idx));
15406 // Transform it so it matches pextrw, which produces a 32-bit result.
15407 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
15408 Op.getOperand(0), Op.getOperand(1));
15409 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15412 if (Subtarget.hasSSE41())
15413 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
15414 return Res;
15416 // TODO: We only extract a single element from v16i8, we can probably afford
15417 // to be more aggressive here before using the default approach of spilling to
15418 // the stack.
15419 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
15420 // Extract either the lowest i32 or any i16, and extract the sub-byte.
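// (Editorial example.) For IdxVal == 5: DWordIdx is 1, so the i32 fast path
// is skipped; the i16 path extracts word WordIdx == 2 and shifts right by
// (5 % 2) * 8 == 8 bits to isolate the requested byte.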
15421 int DWordIdx = IdxVal / 4;
15422 if (DWordIdx == 0) {
15423 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15424 DAG.getBitcast(MVT::v4i32, Vec),
15425 DAG.getIntPtrConstant(DWordIdx, dl));
15426 int ShiftVal = (IdxVal % 4) * 8;
15427 if (ShiftVal != 0)
15428 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
15429 DAG.getConstant(ShiftVal, dl, MVT::i8));
15430 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15431 }
15433 int WordIdx = IdxVal / 2;
15434 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
15435 DAG.getBitcast(MVT::v8i16, Vec),
15436 DAG.getIntPtrConstant(WordIdx, dl));
15437 int ShiftVal = (IdxVal % 2) * 8;
15438 if (ShiftVal != 0)
15439 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
15440 DAG.getConstant(ShiftVal, dl, MVT::i8));
15441 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15442 }
15444 if (VT.getSizeInBits() == 32) {
15445 if (IdxVal == 0)
15446 return Op;
15448 // SHUFPS the element to the lowest double word, then movss.
15449 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
15450 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15451 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15452 DAG.getIntPtrConstant(0, dl));
15455 if (VT.getSizeInBits() == 64) {
15456 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
15457 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
15458 // to match extract_elt for f64.
15459 if (IdxVal == 0)
15460 return Op;
15462 // UNPCKHPD the element to the lowest double word, then movsd.
15463 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
15464 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
15465 int Mask[2] = { 1, -1 };
15466 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15467 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15468 DAG.getIntPtrConstant(0, dl));
15469 }
15471 return SDValue();
15472 }
15474 /// Insert one bit into a mask vector, like v16i1 or v8i1.
15475 /// This is an AVX-512 feature.
15476 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
15477 const X86Subtarget &Subtarget) {
15478 SDLoc dl(Op);
15479 SDValue Vec = Op.getOperand(0);
15480 SDValue Elt = Op.getOperand(1);
15481 SDValue Idx = Op.getOperand(2);
15482 MVT VecVT = Vec.getSimpleValueType();
15484 if (!isa<ConstantSDNode>(Idx)) {
15485 // Non-constant index: extend source and destination,
15486 // insert element and then truncate the result.
15487 unsigned NumElts = VecVT.getVectorNumElements();
15488 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15489 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15490 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
15491 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
15492 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
15493 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
15496 // Copy into a k-register, extract to v1i1 and insert_subvector.
15497 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
15499 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
15500 Idx);
15501 }
15503 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15504 SelectionDAG &DAG) const {
15505 MVT VT = Op.getSimpleValueType();
15506 MVT EltVT = VT.getVectorElementType();
15507 unsigned NumElts = VT.getVectorNumElements();
15509 if (EltVT == MVT::i1)
15510 return InsertBitToMaskVector(Op, DAG, Subtarget);
15512 SDLoc dl(Op);
15513 SDValue N0 = Op.getOperand(0);
15514 SDValue N1 = Op.getOperand(1);
15515 SDValue N2 = Op.getOperand(2);
15516 if (!isa<ConstantSDNode>(N2))
15517 return SDValue();
15518 auto *N2C = cast<ConstantSDNode>(N2);
15519 unsigned IdxVal = N2C->getZExtValue();
15521 bool IsZeroElt = X86::isZeroNode(N1);
15522 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
15524 // If we are inserting an element, see if we can do this more efficiently with
15525 // a blend shuffle with a rematerializable vector than a costly integer
15526 // insertion.
15527 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
15528 16 <= EltVT.getSizeInBits()) {
15529 SmallVector<int, 8> BlendMask;
15530 for (unsigned i = 0; i != NumElts; ++i)
15531 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
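// (Editorial example.) Inserting zero into element 2 of a v4i32 builds the
// mask {0, 1, 6, 3}: lanes 0/1/3 come from N0 and lane 6 (2 + NumElts)
// comes from the zero vector, which selects as a single blend.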
15532 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
15533 : getOnesVector(VT, DAG, dl);
15534 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
15535 }
15537 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
15538 // into that, and then insert the subvector back into the result.
15539 if (VT.is256BitVector() || VT.is512BitVector()) {
15540 // With a 256-bit vector, we can insert into the zero element efficiently
15541 // using a blend if we have AVX or AVX2 and the right data type.
15542 if (VT.is256BitVector() && IdxVal == 0) {
15543 // TODO: It is worthwhile to cast integer to floating point and back
15544 // and incur a domain crossing penalty if that's what we'll end up
15545 // doing anyway after extracting to a 128-bit vector.
15546 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
15547 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
15548 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
15549 N2 = DAG.getIntPtrConstant(1, dl);
15550 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
15551 }
15552 }
15554 // Get the desired 128-bit vector chunk.
15555 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
15557 // Insert the element into the desired chunk.
15558 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
15559 assert(isPowerOf2_32(NumEltsIn128));
15560 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
15561 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
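// (Editorial example.) Inserting into element 9 of a v16i16: NumEltsIn128 is
// 8, so the element lands in 128-bit chunk 1 at index 9 & 7 == 1.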
15563 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
15564 DAG.getConstant(IdxIn128, dl, MVT::i32));
15566 // Insert the changed part back into the bigger vector
15567 return insert128BitVector(N0, V, IdxVal, DAG, dl);
15568 }
15569 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
15571 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
15572 // argument. SSE41 is required for pinsrb.
15573 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
15574 unsigned Opc;
15575 if (VT == MVT::v8i16) {
15576 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
15577 Opc = X86ISD::PINSRW;
15578 } else {
15579 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
15580 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
15581 Opc = X86ISD::PINSRB;
15582 }
15584 if (N1.getValueType() != MVT::i32)
15585 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
15586 if (N2.getValueType() != MVT::i32)
15587 N2 = DAG.getIntPtrConstant(IdxVal, dl);
15588 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
15591 if (Subtarget.hasSSE41()) {
15592 if (EltVT == MVT::f32) {
15593 // Bits [7:6] of the constant are the source select. This will always be
15594 // zero here. The DAG Combiner may combine an extract_elt index into
15595 // these bits. For example (insert (extract, 3), 2) could be matched by
15596 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
15597 // Bits [5:4] of the constant are the destination select. This is the
15598 // value of the incoming immediate.
15599 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
15600 // combine either bitwise AND or insert of float 0.0 to set these bits.
15602 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
15603 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
15604 // If this is an insertion of 32-bits into the low 32-bits of
15605 // a vector, we prefer to generate a blend with immediate rather
15606 // than an insertps. Blends are simpler operations in hardware and so
15607 // will always have equal or better performance than insertps.
15608 // But if optimizing for size and there's a load folding opportunity,
15609 // generate insertps because blendps does not have a 32-bit memory
15610 // operand.
15611 N2 = DAG.getIntPtrConstant(1, dl);
15612 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15613 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
15615 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
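// (Editorial example.) IdxVal == 2 yields the immediate 0x20: destination
// select (bits [5:4]) == 2, source select and zero mask == 0.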
15616 // Create this as a scalar-to-vector.
15617 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15618 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
15621 // PINSR* works with constant index.
15622 if (EltVT == MVT::i32 || EltVT == MVT::i64)
15623 return Op;
15624 }
15626 return SDValue();
15627 }
15629 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
15630 SelectionDAG &DAG) {
15631 SDLoc dl(Op);
15632 MVT OpVT = Op.getSimpleValueType();
15634 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
15635 // further dependency chains.
15636 if (X86::isZeroNode(Op.getOperand(0)))
15637 return getZeroVector(OpVT, Subtarget, DAG, dl);
15639 // If this is a wider-than-128-bit vector result, first insert into a 128-bit
15640 // vector and then insert that into the full-width vector.
15641 if (!OpVT.is128BitVector()) {
15642 // Insert into a 128-bit vector.
15643 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
15644 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
15645 OpVT.getVectorNumElements() / SizeFactor);
15647 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
15649 // Insert the 128-bit vector.
15650 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
15652 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15654 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
15655 if (OpVT == MVT::v4i32)
15656 return Op;
15658 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15659 return DAG.getBitcast(
15660 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
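// (Editorial example.) A v8i16 SCALAR_TO_VECTOR of an i16 %x thus becomes:
// any-extend %x to i32, SCALAR_TO_VECTOR into v4i32, bitcast to v8i16.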
15661 }
15663 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15664 // simple superregister reference or explicit instructions to insert
15665 // the upper bits of a vector.
15666 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15667 SelectionDAG &DAG) {
15668 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15670 return insert1BitVector(Op, DAG, Subtarget);
15671 }
15673 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15674 SelectionDAG &DAG) {
15675 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15676 "Only vXi1 extract_subvectors need custom lowering");
15678 SDLoc dl(Op);
15679 SDValue Vec = Op.getOperand(0);
15680 SDValue Idx = Op.getOperand(1);
15682 if (!isa<ConstantSDNode>(Idx))
15683 return SDValue();
15685 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15686 if (IdxVal == 0) // the operation is legal
15687 return Op;
15689 MVT VecVT = Vec.getSimpleValueType();
15690 unsigned NumElems = VecVT.getVectorNumElements();
15692 // Extend to natively supported kshift.
15693 MVT WideVecVT = VecVT;
15694 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
15695 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
15696 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
15697 DAG.getUNDEF(WideVecVT), Vec,
15698 DAG.getIntPtrConstant(0, dl));
15699 }
15701 // Shift to the LSB.
15702 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
15703 DAG.getConstant(IdxVal, dl, MVT::i8));
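// (Editorial example.) Extracting a v2i1 subvector at index 4 from a v8i1
// without DQI: the mask is widened to v16i1 above, KSHIFTR moves elements
// 4..5 down to 0..1, and the extract at index 0 below is then legal.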
15705 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
15706 DAG.getIntPtrConstant(0, dl));
15707 }
15709 // Returns the appropriate wrapper opcode for a global reference.
15710 unsigned X86TargetLowering::getGlobalWrapperKind(
15711 const GlobalValue *GV, const unsigned char OpFlags) const {
15712 // References to absolute symbols are never PC-relative.
15713 if (GV && GV->isAbsoluteSymbolRef())
15714 return X86ISD::Wrapper;
15716 CodeModel::Model M = getTargetMachine().getCodeModel();
15717 if (Subtarget.isPICStyleRIPRel() &&
15718 (M == CodeModel::Small || M == CodeModel::Kernel))
15719 return X86ISD::WrapperRIP;
15721 // GOTPCREL references must always use RIP.
15722 if (OpFlags == X86II::MO_GOTPCREL)
15723 return X86ISD::WrapperRIP;
15725 return X86ISD::Wrapper;
15726 }
15728 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15729 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15730 // one of the above-mentioned nodes. It has to be wrapped because otherwise
15731 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
15732 // be used to form an addressing mode. These wrapped nodes will be selected
15733 // into MOV32ri.
15734 SDValue
15735 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15736 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15738 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15739 // global base reg.
15740 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15742 auto PtrVT = getPointerTy(DAG.getDataLayout());
15743 SDValue Result = DAG.getTargetConstantPool(
15744 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
15745 SDLoc DL(CP);
15746 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15747 // With PIC, the address is actually $g + Offset.
15748 if (OpFlag) {
15749 Result =
15750 DAG.getNode(ISD::ADD, DL, PtrVT,
15751 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15752 }
15754 return Result;
15755 }
15757 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15758 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15760 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15761 // global base reg.
15762 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15764 auto PtrVT = getPointerTy(DAG.getDataLayout());
15765 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
15766 SDLoc DL(JT);
15767 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15769 // With PIC, the address is actually $g + Offset.
15770 if (OpFlag) {
15771 Result =
15772 DAG.getNode(ISD::ADD, DL, PtrVT,
15773 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15774 }
15776 return Result;
15777 }
15778 SDValue
15779 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15780 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15782 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15783 // global base reg.
15784 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15785 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15787 auto PtrVT = getPointerTy(DAG.getDataLayout());
15788 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15790 SDLoc DL(Op);
15791 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15793 // With PIC, the address is actually $g + Offset.
15794 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15795 Result =
15796 DAG.getNode(ISD::ADD, DL, PtrVT,
15797 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15798 }
15800 // For symbols that require a load from a stub to get the address, emit the
15801 // load.
15802 if (isGlobalStubReference(OpFlag))
15803 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15804 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15806 return Result;
15807 }
15809 SDValue
15810 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15811 // Create the TargetBlockAddressAddress node.
15812 unsigned char OpFlags =
15813 Subtarget.classifyBlockAddressReference();
15814 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15815 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15816 SDLoc dl(Op);
15817 auto PtrVT = getPointerTy(DAG.getDataLayout());
15818 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15819 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15821 // With PIC, the address is actually $g + Offset.
15822 if (isGlobalRelativeToPICBase(OpFlags)) {
15823 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15824 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15825 }
15827 return Result;
15828 }
15830 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15831 const SDLoc &dl, int64_t Offset,
15832 SelectionDAG &DAG) const {
15833 // Create the TargetGlobalAddress node, folding in the constant
15834 // offset if it is legal.
15835 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15836 CodeModel::Model M = DAG.getTarget().getCodeModel();
15837 auto PtrVT = getPointerTy(DAG.getDataLayout());
15838 SDValue Result;
15839 if (OpFlags == X86II::MO_NO_FLAG &&
15840 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15841 // A direct static reference to a global.
15842 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15843 Offset = 0;
15844 } else {
15845 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15846 }
15848 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
15850 // With PIC, the address is actually $g + Offset.
15851 if (isGlobalRelativeToPICBase(OpFlags)) {
15852 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15853 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15854 }
15856 // For globals that require a load from a stub to get the address, emit the
15857 // load.
15858 if (isGlobalStubReference(OpFlags))
15859 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15860 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15862 // If there was a non-zero offset that we didn't fold, create an explicit
15863 // addition for it.
15864 if (Offset != 0)
15865 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15866 DAG.getConstant(Offset, dl, PtrVT));
15868 return Result;
15869 }
15871 SDValue
15872 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15873 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15874 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15875 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
15876 }
15878 static SDValue
15879 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15880 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15881 unsigned char OperandFlags, bool LocalDynamic = false) {
15882 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15883 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15884 SDLoc dl(GA);
15885 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15886 GA->getValueType(0),
15887 GA->getOffset(), OperandFlags);
15890 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15891 : X86ISD::TLSADDR;
15893 if (InFlag) {
15894 SDValue Ops[] = { Chain, TGA, *InFlag };
15895 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15896 } else {
15897 SDValue Ops[] = { Chain, TGA };
15898 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15899 }
15901 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
15902 MFI.setAdjustsStack(true);
15903 MFI.setHasCalls(true);
15905 SDValue Flag = Chain.getValue(1);
15906 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15907 }
15909 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
15910 static SDValue
15911 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15912 const EVT PtrVT) {
15913 SDValue InFlag;
15914 SDLoc dl(GA); // ? function entry point might be better
15915 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15916 DAG.getNode(X86ISD::GlobalBaseReg,
15917 SDLoc(), PtrVT), InFlag);
15918 InFlag = Chain.getValue(1);
15920 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15921 }
15923 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
15924 static SDValue
15925 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15926 const EVT PtrVT) {
15927 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15928 X86::RAX, X86II::MO_TLSGD);
15929 }
15931 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15932 SelectionDAG &DAG, const EVT PtrVT,
15933 bool is64Bit) {
15934 SDLoc dl(GA);
15937 // Get the start address of the TLS block for this module.
15938 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15939 .getInfo<X86MachineFunctionInfo>();
15940 MFI->incNumLocalDynamicTLSAccesses();
15941 SDValue Base;
15943 if (is64Bit) {
15944 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15945 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15946 } else {
15947 SDValue InFlag;
15948 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15949 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15950 InFlag = Chain.getValue(1);
15951 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15952 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15953 }
15955 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
15956 // of Base.
15959 unsigned char OperandFlags = X86II::MO_DTPOFF;
15960 unsigned WrapperKind = X86ISD::Wrapper;
15961 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15962 GA->getValueType(0),
15963 GA->getOffset(), OperandFlags);
15964 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15966 // Add x@dtpoff with the base.
15967 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15970 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15971 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15972 const EVT PtrVT, TLSModel::Model model,
15973 bool is64Bit, bool isPIC) {
15974 SDLoc dl(GA);
15976 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15977 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15978 is64Bit ? 257 : 256));
15980 SDValue ThreadPointer =
15981 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15982 MachinePointerInfo(Ptr));
15984 unsigned char OperandFlags = 0;
15985 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
15986 // initial exec.
15987 unsigned WrapperKind = X86ISD::Wrapper;
15988 if (model == TLSModel::LocalExec) {
15989 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15990 } else if (model == TLSModel::InitialExec) {
15991 if (is64Bit) {
15992 OperandFlags = X86II::MO_GOTTPOFF;
15993 WrapperKind = X86ISD::WrapperRIP;
15994 } else {
15995 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15996 }
15997 } else {
15998 llvm_unreachable("Unexpected model");
15999 }
16001 // emit "addl x@ntpoff,%eax" (local exec)
16002 // or "addl x@indntpoff,%eax" (initial exec)
16003 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
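// (Editorial note; typical 64-bit sequences for reference, not emitted
// verbatim here.) Local exec is usually
//   movq %fs:0, %rax
//   leaq x@tpoff(%rax), %rax
// and initial exec loads the GOT entry first:
//   movq x@gottpoff(%rip), %rcx
//   movq %fs:0, %rax
//   addq %rcx, %rax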
16004 SDValue TGA =
16005 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
16006 GA->getOffset(), OperandFlags);
16007 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
16009 if (model == TLSModel::InitialExec) {
16010 if (isPIC && !is64Bit) {
16011 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
16012 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
16013 Offset);
16014 }
16016 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
16017 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
16018 }
16020 // The address of the thread local variable is the add of the thread
16021 // pointer with the offset of the variable.
16022 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
16023 }
16025 SDValue
16026 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
16028 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
16030 if (DAG.getTarget().useEmulatedTLS())
16031 return LowerToTLSEmulatedModel(GA, DAG);
16033 const GlobalValue *GV = GA->getGlobal();
16034 auto PtrVT = getPointerTy(DAG.getDataLayout());
16035 bool PositionIndependent = isPositionIndependent();
16037 if (Subtarget.isTargetELF()) {
16038 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
16039 switch (model) {
16040 case TLSModel::GeneralDynamic:
16041 if (Subtarget.is64Bit())
16042 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
16043 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
16044 case TLSModel::LocalDynamic:
16045 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
16046 Subtarget.is64Bit());
16047 case TLSModel::InitialExec:
16048 case TLSModel::LocalExec:
16049 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
16050 PositionIndependent);
16052 llvm_unreachable("Unknown TLS model.");
16055 if (Subtarget.isTargetDarwin()) {
16056 // Darwin only has one model of TLS. Lower to that.
16057 unsigned char OpFlag = 0;
16058 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
16059 X86ISD::WrapperRIP : X86ISD::Wrapper;
16061 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
16062 // global base reg.
16063 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
16064 if (PIC32)
16065 OpFlag = X86II::MO_TLVP_PIC_BASE;
16066 else
16067 OpFlag = X86II::MO_TLVP;
16068 SDLoc DL(Op);
16069 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
16070 GA->getValueType(0),
16071 GA->getOffset(), OpFlag);
16072 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
16074 // With PIC32, the address is actually $g + Offset.
16075 if (PIC32)
16076 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
16077 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
16078 Offset);
16080 // Lowering the machine ISD will make sure everything is in the right
16081 // location.
16082 SDValue Chain = DAG.getEntryNode();
16083 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16084 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16085 SDValue Args[] = { Chain, Offset };
16086 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
16087 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
16088 DAG.getIntPtrConstant(0, DL, true),
16089 Chain.getValue(1), DL);
16091 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
16092 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
16093 MFI.setAdjustsStack(true);
16095 // And our return value (tls address) is in the standard call return value
16096 // location.
16097 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
16098 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
16099 }
16101 if (Subtarget.isTargetKnownWindowsMSVC() ||
16102 Subtarget.isTargetWindowsItanium() ||
16103 Subtarget.isTargetWindowsGNU()) {
16104 // Just use the implicit TLS architecture
16105 // Need to generate something similar to:
16106 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
16108 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
16109 // mov rcx, qword [rdx+rcx*8]
16110 // mov eax, .tls$:tlsvar
16111 // [rax+rcx] contains the address
16112 // Windows 64bit: gs:0x58
16113 // Windows 32bit: fs:__tls_array
16115 SDLoc dl(GA);
16116 SDValue Chain = DAG.getEntryNode();
16118 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
16119 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
16120 // use its literal value of 0x2C.
16121 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
16122 ? Type::getInt8PtrTy(*DAG.getContext(),
16123 256)
16124 : Type::getInt32PtrTy(*DAG.getContext(),
16125 257));
16127 SDValue TlsArray = Subtarget.is64Bit()
16128 ? DAG.getIntPtrConstant(0x58, dl)
16129 : (Subtarget.isTargetWindowsGNU()
16130 ? DAG.getIntPtrConstant(0x2C, dl)
16131 : DAG.getExternalSymbol("_tls_array", PtrVT));
16133 SDValue ThreadPointer =
16134 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
16136 SDValue res;
16137 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
16138 res = ThreadPointer;
16139 } else {
16140 // Load the _tls_index variable
16141 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
16142 if (Subtarget.is64Bit())
16143 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
16144 MachinePointerInfo(), MVT::i32);
16145 else
16146 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
16148 auto &DL = DAG.getDataLayout();
16149 SDValue Scale =
16150 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
16151 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
16153 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
16154 }
16156 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
16158 // Get the offset of the start of the .tls section.
16159 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
16160 GA->getValueType(0),
16161 GA->getOffset(), X86II::MO_SECREL);
16162 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
16164 // The address of the thread local variable is the add of the thread
16165 // pointer with the offset of the variable.
16166 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
16169 llvm_unreachable("TLS not implemented for this target.");
16172 /// Lower SRA_PARTS and friends, which return two i32 values
16173 /// and take a 2 x i32 value to shift plus a shift amount.
16174 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
16175 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
16176 MVT VT = Op.getSimpleValueType();
16177 unsigned VTBits = VT.getSizeInBits();
16178 SDLoc dl(Op);
16179 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
16180 SDValue ShOpLo = Op.getOperand(0);
16181 SDValue ShOpHi = Op.getOperand(1);
16182 SDValue ShAmt = Op.getOperand(2);
16183 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
16184 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
16186 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16187 DAG.getConstant(VTBits - 1, dl, MVT::i8));
16188 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
16189 DAG.getConstant(VTBits - 1, dl, MVT::i8))
16190 : DAG.getConstant(0, dl, VT);
16192 SDValue Tmp2, Tmp3;
16193 if (Op.getOpcode() == ISD::SHL_PARTS) {
16194 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
16195 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
16197 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
16198 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
16201 // If the shift amount is greater than or equal to the width of a part we
16202 // can't rely on the results of shld/shrd. Insert a test and select the
16203 // appropriate values for large shift amounts.
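// (Editorial example.) An i64 SRL_PARTS by 40 with 32-bit parts: SafeShAmt
// is 40 & 31 == 8, and since (40 & 32) != 0 the selects below pick
// Lo = ShOpHi >> 8 and Hi = 0 instead of the SHRD-based results.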
16204 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16205 DAG.getConstant(VTBits, dl, MVT::i8));
16206 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
16207 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
16209 SDValue Hi, Lo;
16210 if (Op.getOpcode() == ISD::SHL_PARTS) {
16211 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
16212 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
16214 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
16215 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
16216 }
16218 return DAG.getMergeValues({ Lo, Hi }, dl);
16219 }
16221 // Try to use a packed vector operation to handle i64 on 32-bit targets when
16222 // AVX512DQ is enabled.
16223 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
16224 const X86Subtarget &Subtarget) {
16225 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
16226 Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
16227 SDValue Src = Op.getOperand(0);
16228 MVT SrcVT = Src.getSimpleValueType();
16229 MVT VT = Op.getSimpleValueType();
16231 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
16232 (VT != MVT::f32 && VT != MVT::f64))
16233 return SDValue();
16235 // Pack the i64 into a vector, do the operation and extract.
16237 // Using 256-bit to ensure result is 128-bits for f32 case.
16238 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
16239 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
16240 MVT VecVT = MVT::getVectorVT(VT, NumElts);
16242 SDLoc dl(Op);
16243 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
16244 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
16245 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
16246 DAG.getIntPtrConstant(0, dl));
16247 }
16249 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
16250 SelectionDAG &DAG) const {
16251 SDValue Src = Op.getOperand(0);
16252 MVT SrcVT = Src.getSimpleValueType();
16253 MVT VT = Op.getSimpleValueType();
16255 SDLoc dl(Op);
16256 if (SrcVT.isVector()) {
16257 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
16258 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
16259 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
16260 DAG.getUNDEF(SrcVT)));
16261 }
16262 return SDValue();
16263 }
16265 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
16266 "Unknown SINT_TO_FP to lower!");
16268 // These are really Legal; return the operand so the caller accepts it as
16269 // Legal.
16270 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
16271 return Op;
16272 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
16273 return Op;
16274 }
16276 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16277 return V;
16279 SDValue ValueToStore = Op.getOperand(0);
16280 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
16281 !Subtarget.is64Bit())
16282 // Bitcasting to f64 here allows us to do a single 64-bit store from
16283 // an SSE register, avoiding the store forwarding penalty that would come
16284 // with two 32-bit stores.
16285 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16287 unsigned Size = SrcVT.getSizeInBits()/8;
16288 MachineFunction &MF = DAG.getMachineFunction();
16289 auto PtrVT = getPointerTy(MF.getDataLayout());
16290 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
16291 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16292 SDValue Chain = DAG.getStore(
16293 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16294 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16295 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
16296 }
16298 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
16299 SDValue StackSlot,
16300 SelectionDAG &DAG) const {
16301 // Build the FILD
16302 SDLoc DL(Op);
16303 SDVTList Tys;
16304 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
16305 if (useSSE)
16306 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
16307 else
16308 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
16310 unsigned ByteSize = SrcVT.getSizeInBits()/8;
16312 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
16313 MachineMemOperand *MMO;
16314 if (FI) {
16315 int SSFI = FI->getIndex();
16316 MMO = DAG.getMachineFunction().getMachineMemOperand(
16317 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16318 MachineMemOperand::MOLoad, ByteSize, ByteSize);
16319 } else {
16320 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
16321 StackSlot = StackSlot.getOperand(1);
16322 }
16323 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
16324 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
16325 X86ISD::FILD, DL,
16326 Tys, Ops, SrcVT, MMO);
16328 if (useSSE) {
16329 Chain = Result.getValue(1);
16330 SDValue InFlag = Result.getValue(2);
16332 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
16333 // shouldn't be necessary except that RFP cannot be live across
16334 // multiple blocks. When stackifier is fixed, they can be uncoupled.
16335 MachineFunction &MF = DAG.getMachineFunction();
16336 unsigned SSFISize = Op.getValueSizeInBits()/8;
16337 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
16338 auto PtrVT = getPointerTy(MF.getDataLayout());
16339 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16340 Tys = DAG.getVTList(MVT::Other);
16341 SDValue Ops[] = {
16342 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
16343 };
16344 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16345 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16346 MachineMemOperand::MOStore, SSFISize, SSFISize);
16348 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
16349 Ops, Op.getValueType(), MMO);
16350 Result = DAG.getLoad(
16351 Op.getValueType(), DL, Chain, StackSlot,
16352 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16353 }
16355 return Result;
16356 }
16358 /// 64-bit unsigned integer to double expansion.
16359 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
16360 const X86Subtarget &Subtarget) {
16361 // This algorithm is not obvious. Here is what we're trying to output:
16362 /*
16363 movq %rax, %xmm0
16364 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
16365 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
16366 #ifdef __SSE3__
16367 haddpd %xmm0, %xmm0
16368 #else
16369 pshufd $0x4e, %xmm0, %xmm1
16370 addpd %xmm1, %xmm0
16371 #endif
16372 */
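// (Editorial note on why the constants work.) A double with exponent field
// 0x433 (2**52) holds a 32-bit value in its low mantissa bits with ulp 1, so
// lane 0 reads as 2**52 + lo; exponent 0x453 (2**84) has ulp 2**32, so lane 1
// reads as 2**84 + hi * 2**32. Subtracting c1 = { 2**52, 2**84 } leaves
// { lo, hi * 2**32 } exactly, and the final add rounds once, yielding the
// correctly rounded double for hi * 2**32 + lo.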
16374 SDLoc dl(Op);
16375 LLVMContext *Context = DAG.getContext();
16377 // Build some magic constants.
16378 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
16379 Constant *C0 = ConstantDataVector::get(*Context, CV0);
16380 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
16381 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
16383 SmallVector<Constant*,2> CV1;
16384 CV1.push_back(
16385 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16386 APInt(64, 0x4330000000000000ULL))));
16387 CV1.push_back(
16388 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16389 APInt(64, 0x4530000000000000ULL))));
16390 Constant *C1 = ConstantVector::get(CV1);
16391 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
16393 // Load the 64-bit value into an XMM register.
16394 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
16395 Op.getOperand(0));
16396 SDValue CLod0 =
16397 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
16398 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16399 /* Alignment = */ 16);
16400 SDValue Unpck1 =
16401 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
16403 SDValue CLod1 =
16404 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
16405 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16406 /* Alignment = */ 16);
16407 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
16408 // TODO: Are there any fast-math-flags to propagate here?
16409 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
16411 SDValue Result;
16412 if (Subtarget.hasSSE3()) {
16413 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
16414 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
16415 } else {
16416 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
16417 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
16418 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
16419 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
16420 }
16422 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
16423 DAG.getIntPtrConstant(0, dl));
16424 }
16426 /// 32-bit unsigned integer to float expansion.
16427 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
16428 const X86Subtarget &Subtarget) {
16429 SDLoc dl(Op);
16430 // FP constant to bias-correct the final result.
16431 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
16432 MVT::f64);
16434 // Load the 32-bit value into an XMM register.
16435 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
16436 Op.getOperand(0));
16438 // Zero out the upper parts of the register.
16439 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
16441 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16442 DAG.getBitcast(MVT::v2f64, Load),
16443 DAG.getIntPtrConstant(0, dl));
16445 // Or the load with the bias.
16446 SDValue Or = DAG.getNode(
16447 ISD::OR, dl, MVT::v2i64,
16448 DAG.getBitcast(MVT::v2i64,
16449 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
16450 DAG.getBitcast(MVT::v2i64,
16451 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
16452 Or =
16453 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16454 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
16456 // Subtract the bias.
16457 // TODO: Are there any fast-math-flags to propagate here?
16458 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
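// (Editorial note.) OR-ing the zero-extended 32-bit value into the mantissa
// of 0x4330000000000000 (2**52) forms the exact double 2**52 + x, so the
// subtraction above recovers x exactly; only the final rounding to the
// destination type (for f32) can round.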
16460 // Handle final rounding.
16461 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
16462 }
16464 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
16465 const X86Subtarget &Subtarget,
16466 const SDLoc &DL) {
16467 if (Op.getSimpleValueType() != MVT::v2f64)
16468 return SDValue();
16470 SDValue N0 = Op.getOperand(0);
16471 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
16473 // Legalize to v4i32 type.
16474 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
16475 DAG.getUNDEF(MVT::v2i32));
16477 if (Subtarget.hasAVX512())
16478 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
16480 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
16481 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
16482 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
16483 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
16485 // Two to the power of half-word-size.
16486 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
16488 // Clear upper part of LO, lower HI.
16489 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
16490 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
16492 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
16493 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
16494 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
16496 // Add the two halves.
16497 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
16500 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
16501 const X86Subtarget &Subtarget) {
16502 // The algorithm is the following:
16503 // #ifdef __SSE4_1__
16504 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16505 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16506 // (uint4) 0x53000000, 0xaa);
16507 // #else
16508 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16509 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16510 // #endif
16511 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16512 // return (float4) lo + fhi;
16514 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
16515 // reassociate the two FADDs, and if we do that, the algorithm fails
16516 // spectacularly (PR24512).
16517 // FIXME: If we ever have some kind of Machine FMF, this should be marked
16518 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
16519 // there's also the MachineCombiner reassociations happening on Machine IR.
16520 if (DAG.getTarget().Options.UnsafeFPMath)
16521 return SDValue();
16523 SDLoc DL(Op);
16524 SDValue V = Op->getOperand(0);
16525 MVT VecIntVT = V.getSimpleValueType();
16526 bool Is128 = VecIntVT == MVT::v4i32;
16527 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
16528 // If we convert to something other than the supported type, e.g., to v4f64,
16529 // bail out early.
16530 if (VecFloatVT != Op->getSimpleValueType(0))
16531 return SDValue();
16533 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
16534 "Unsupported custom type");
16536 // In the #ifdef/#else code, we have in common:
16537 // - The vector of constants:
16543 // Create the splat vector for 0x4b000000.
16544 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
16545 // Create the splat vector for 0x53000000.
16546 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
16548 // Create the right shift.
16549 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
16550 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
16552 SDValue Low, High;
16553 if (Subtarget.hasSSE41()) {
16554 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
16555 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16556 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
16557 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
16558 // Low will be bitcasted right away, so do not bother bitcasting back to its
16559 // original type.
16560 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
16561 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16562 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16563 // (uint4) 0x53000000, 0xaa);
16564 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
16565 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
16566 // High will be bitcasted right away, so do not bother bitcasting back to
16567 // its original type.
16568 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
16569 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16570 } else {
16571 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
16572 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16573 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
16574 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
16576 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16577 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
16578 }
16580 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
16581 SDValue VecCstFAdd = DAG.getConstantFP(
16582 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
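// (Editorial note.) 0x4b000000 is 2**23 with ulp 1, so lo reads as
// 2**23 + (v & 0xffff); 0x53000000 is 2**39 with ulp 2**16, so hi reads as
// 2**39 + (v >> 16) * 2**16. 0xD3000080 is -(2**39 + 2**23), hence
// fhi == (v >> 16) * 2**16 - 2**23 exactly, and lo + fhi rounds once to v.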
16584 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16585 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
16586 // TODO: Are there any fast-math-flags to propagate here?
16587 SDValue FHigh =
16588 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16589 // return (float4) lo + fhi;
16590 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16591 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
16592 }
16594 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16595 const X86Subtarget &Subtarget) {
16596 SDValue N0 = Op.getOperand(0);
16597 MVT SrcVT = N0.getSimpleValueType();
16599 SDLoc dl(Op);
16600 switch (SrcVT.SimpleTy) {
16601 default:
16602 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16603 case MVT::v2i32:
16604 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
16605 case MVT::v4i32:
16606 case MVT::v8i32:
16607 assert(!Subtarget.hasAVX512());
16608 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
16609 }
16610 }
16612 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16613 SelectionDAG &DAG) const {
16614 SDValue N0 = Op.getOperand(0);
16615 SDLoc dl(Op);
16616 auto PtrVT = getPointerTy(DAG.getDataLayout());
16618 if (Op.getSimpleValueType().isVector())
16619 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16621 MVT SrcVT = N0.getSimpleValueType();
16622 MVT DstVT = Op.getSimpleValueType();
16624 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16625 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16626 // Conversions from unsigned i32 to f32/f64 are legal,
16627 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16628 return Op;
16629 }
16631 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16632 return V;
16634 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16635 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16636 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16637 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16638 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16639 return SDValue();
16641 // Make a 64-bit buffer, and use it to build an FILD.
16642 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
16643 if (SrcVT == MVT::i32) {
16644 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16645 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16646 StackSlot, MachinePointerInfo());
16647 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16648 OffsetSlot, MachinePointerInfo());
16649 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16650 return Fild;
16651 }
16653 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16654 SDValue ValueToStore = Op.getOperand(0);
16655 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16656 // Bitcasting to f64 here allows us to do a single 64-bit store from
16657 // an SSE register, avoiding the store forwarding penalty that would come
16658 // with two 32-bit stores.
16659 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16660 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16661 MachinePointerInfo());
16662 // For i64 source, we need to add the appropriate power of 2 if the input
16663 // was negative. This is the same as the optimization in
16664 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
16665 // we must be careful to do the computation in x87 extended precision, not
16666 // in SSE. (The generic code can't know it's OK to do this, or how to.)
16667 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16668 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16669 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16670 MachineMemOperand::MOLoad, 8, 8);
16672 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16673 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16674 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
16675 MVT::i64, MMO);
16677 APInt FF(32, 0x5F800000ULL);
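// (Editorial note.) 0x5F800000 is 2**64 as an IEEE-754 single; it is added
// below only when the input's sign bit was set, compensating for the signed
// interpretation of the i64 by FILD.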
16679 // Check whether the sign bit is set.
16680 SDValue SignSet = DAG.getSetCC(
16681 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16682 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16684 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16685 SDValue FudgePtr = DAG.getConstantPool(
16686 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16688 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16689 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16690 SDValue Four = DAG.getIntPtrConstant(4, dl);
16691 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16692 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16694 // Load the value out, extending it from f32 to f80.
16695 // FIXME: Avoid the extend by constructing the right constant pool?
16696 SDValue Fudge = DAG.getExtLoad(
16697 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16698 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16699 /* Alignment = */ 4);
16700 // Extend everything to 80 bits to force it to be done on x87.
16701 // TODO: Are there any fast-math-flags to propagate here?
16702 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
16703 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16704 DAG.getIntPtrConstant(0, dl));
16707 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16708 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16709 // just return an <SDValue(), SDValue()> pair.
16710 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16711 // to i16, i32 or i64, and we lower it to a legal sequence.
16712 // If lowered to the final integer result we return a <result, SDValue()> pair.
16713 // Otherwise we lower it to a sequence ending with a FIST, return a
16714 // <FIST, StackSlot> pair, and the caller is responsible for loading
16715 // the final integer result from StackSlot.
16716 std::pair<SDValue,SDValue>
16717 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16718 bool IsSigned, bool IsReplace) const {
16720 SDLoc DL(Op);
16721 EVT DstTy = Op.getValueType();
16722 EVT TheVT = Op.getOperand(0).getValueType();
16723 auto PtrVT = getPointerTy(DAG.getDataLayout());
16725 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16726 // f16 must be promoted before using the lowering in this routine.
16727 // fp128 does not use this lowering.
16728 return std::make_pair(SDValue(), SDValue());
16731 // If using FIST to compute an unsigned i64, we'll need some fixup
16732 // to handle values above the maximum signed i64. A FIST is always
16733 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16734 bool UnsignedFixup = !IsSigned &&
16735 DstTy == MVT::i64 &&
16736 (!Subtarget.is64Bit() ||
16737 !isScalarFPTypeInSSEReg(TheVT));
16739 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16740 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16741 // The low 32 bits of the fist result will have the correct uint32 result.
16742 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16743 DstTy = MVT::i64;
16744 }
16746 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16747 DstTy.getSimpleVT() >= MVT::i16 &&
16748 "Unknown FP_TO_INT to lower!");
16750 // These are really Legal.
16751 if (DstTy == MVT::i32 &&
16752 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16753 return std::make_pair(SDValue(), SDValue());
16754 if (Subtarget.is64Bit() &&
16755 DstTy == MVT::i64 &&
16756 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16757 return std::make_pair(SDValue(), SDValue());
16759 // We lower FP->int64 into FISTP64 followed by a load from a temporary
16760 // stack slot.
16761 MachineFunction &MF = DAG.getMachineFunction();
16762 unsigned MemSize = DstTy.getSizeInBits()/8;
16763 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16764 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16766 unsigned Opc;
16767 switch (DstTy.getSimpleVT().SimpleTy) {
16768 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16769 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16770 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16771 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16774 SDValue Chain = DAG.getEntryNode();
16775 SDValue Value = Op.getOperand(0);
16776 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16778 if (UnsignedFixup) {
16780 // Conversion to unsigned i64 is implemented with a select,
16781 // depending on whether the source value fits in the range
16782 // of a signed i64. Let Thresh be the FP equivalent of
16783 // 0x8000000000000000ULL.
16785 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16786 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16787 // Fist-to-mem64 FistSrc
16788 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16789 // to XOR'ing the high 32 bits with Adjust.
16791 // Being a power of 2, Thresh is exactly representable in all FP formats.
16792 // For X87 we'd like to use the smallest FP type for this constant, but
16793 // for DAG type consistency we have to match the FP operand type.
16795 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16796 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16797 bool LosesInfo = false;
16798 if (TheVT == MVT::f64)
16799 // The rounding mode is irrelevant as the conversion should be exact.
16800 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16801 &LosesInfo);
16802 else if (TheVT == MVT::f80)
16803 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16804 APFloat::rmNearestTiesToEven, &LosesInfo);
16806 assert(Status == APFloat::opOK && !LosesInfo &&
16807 "FP conversion should have been exact");
16809 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16811 SDValue Cmp = DAG.getSetCC(DL,
16812 getSetCCResultType(DAG.getDataLayout(),
16813 *DAG.getContext(), TheVT),
16814 Value, ThreshVal, ISD::SETLT);
16815 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16816 DAG.getConstant(0, DL, MVT::i32),
16817 DAG.getConstant(0x80000000, DL, MVT::i32));
16818 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16819 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16820 *DAG.getContext(), TheVT),
16821 Value, ThreshVal, ISD::SETLT);
16822 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16823 }
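// (Editorial example.) Converting 2**63 + 42: Value >= Thresh, so Adjust is
// 0x80000000 and the FIST source becomes 42. XOR-ing the high half of the
// 64-bit result with Adjust then sets bit 63, reconstructing 2**63 + 42.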
16825 // FIXME: This causes a redundant load/store if the SSE-class value is
16826 // already in memory, such as if it is on the call stack.
16827 if (isScalarFPTypeInSSEReg(TheVT)) {
16828 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16829 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16830 MachinePointerInfo::getFixedStack(MF, SSFI));
16831 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16832 SDValue Ops[] = {
16833 Chain, StackSlot, DAG.getValueType(TheVT)
16834 };
16836 MachineMemOperand *MMO =
16837 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16838 MachineMemOperand::MOLoad, MemSize, MemSize);
16839 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16840 Chain = Value.getValue(1);
16841 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16842 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16843 }
16845 MachineMemOperand *MMO =
16846 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16847 MachineMemOperand::MOStore, MemSize, MemSize);
16849 if (UnsignedFixup) {
16851 // Insert the FIST, load its result as two i32's,
16852 // and XOR the high i32 with Adjust.
16854 SDValue FistOps[] = { Chain, Value, StackSlot };
16855 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16856 FistOps, DstTy, MMO);
16858 SDValue Low32 =
16859 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16860 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16862 SDValue High32 =
16863 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16864 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16866 if (Subtarget.is64Bit()) {
16867 // Join High32 and Low32 into a 64-bit result.
16868 // (High32 << 32) | Low32
16869 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16870 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16871 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16872 DAG.getConstant(32, DL, MVT::i8));
16873 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16874 return std::make_pair(Result, SDValue());
16877 SDValue ResultOps[] = { Low32, High32 };
16879 SDValue pair = IsReplace
16880 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16881 : DAG.getMergeValues(ResultOps, DL);
16882 return std::make_pair(pair, SDValue());
16883 }
16884 // Build the FP_TO_INT*_IN_MEM
16885 SDValue Ops[] = { Chain, Value, StackSlot };
16886 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16887 Ops, DstTy, MMO);
16888 return std::make_pair(FIST, StackSlot);
16889 }
16892 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16893 const X86Subtarget &Subtarget) {
16894 MVT VT = Op->getSimpleValueType(0);
16895 SDValue In = Op->getOperand(0);
16896 MVT InVT = In.getSimpleValueType();
16897 SDLoc dl(Op);
16899 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
16900 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16901 "Expected same number of elements");
16902 assert((VT.getVectorElementType() == MVT::i16 ||
16903 VT.getVectorElementType() == MVT::i32 ||
16904 VT.getVectorElementType() == MVT::i64) &&
16905 "Unexpected element type");
16906 assert((InVT.getVectorElementType() == MVT::i8 ||
16907 InVT.getVectorElementType() == MVT::i16 ||
16908 InVT.getVectorElementType() == MVT::i32) &&
16909 "Unexpected element type");
16911 if (Subtarget.hasInt256())
16912 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16914 // Optimize vectors in AVX mode:
16915 //
16916 // v8i16 -> v8i32
16917 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16918 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16919 // Concat upper and lower parts.
16920 //
16921 // v4i32 -> v4i64
16922 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16923 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16924 // Concat upper and lower parts.
16927 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16928 SDValue Undef = DAG.getUNDEF(InVT);
16929 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16930 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16931 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16933 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16934 VT.getVectorNumElements()/2);
16936 OpLo = DAG.getBitcast(HVT, OpLo);
16937 OpHi = DAG.getBitcast(HVT, OpHi);
16939 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16940 }
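// Illustrative sketch (added commentary, not original): on AVX1,
// zero_extend v8i16 -> v8i32 becomes
//   OpLo = vpunpcklwd In, Zero   ; low 4 words interleaved with zeros
//   OpHi = vpunpckhwd In, Zero   ; high 4 words interleaved with zeros
//   Res  = concat_vectors (bitcast v4i32 OpLo), (bitcast v4i32 OpHi)
// Using UNDEF instead of Zero gives the any_extend flavor.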
16942 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
16943 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
16944 const SDLoc &dl, SelectionDAG &DAG) {
16945 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
16946 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16947 DAG.getIntPtrConstant(0, dl));
16948 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16949 DAG.getIntPtrConstant(8, dl));
16950 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
16951 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
16952 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
16953 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
16954 }
16956 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16957 const X86Subtarget &Subtarget,
16958 SelectionDAG &DAG) {
16959 MVT VT = Op->getSimpleValueType(0);
16960 SDValue In = Op->getOperand(0);
16961 MVT InVT = In.getSimpleValueType();
16962 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16963 SDLoc DL(Op);
16964 unsigned NumElts = VT.getVectorNumElements();
16966 // For all vectors except vXi8, we can just emit a sign_extend and a shift.
16967 // This avoids a constant pool load.
16968 if (VT.getVectorElementType() != MVT::i8) {
16969 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
16970 return DAG.getNode(ISD::SRL, DL, VT, Extend,
16971 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
16972 }
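// Illustrative note (added commentary, not original): for zext v8i1 -> v8i32
// a true lane first sign-extends to 0xFFFFFFFF; the logical shift right by
// 31 then leaves 0x00000001, the desired zero-extended value, without
// loading a constant-pool mask.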
16974 // Extend VT if BWI is not supported.
16975 MVT ExtVT = VT;
16976 if (!Subtarget.hasBWI()) {
16977 // If v16i32 is to be avoided, we'll need to split and concatenate.
16978 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
16979 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
16981 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16982 }
16984 // Widen to 512-bits if VLX is not supported.
16985 MVT WideVT = ExtVT;
16986 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16987 NumElts *= 512 / ExtVT.getSizeInBits();
16988 InVT = MVT::getVectorVT(MVT::i1, NumElts);
16989 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16990 In, DAG.getIntPtrConstant(0, DL));
16991 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
16992 NumElts);
16993 }
16995 SDValue One = DAG.getConstant(1, DL, WideVT);
16996 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
16998 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
17000 // Truncate if we had to extend above.
17001 if (VT != ExtVT) {
17002 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
17003 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
17004 }
17006 // Extract back to 128/256-bit if we widened.
17007 if (WideVT != VT)
17008 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
17009 DAG.getIntPtrConstant(0, DL));
17011 return SelectedVal;
17012 }
17014 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17015 SelectionDAG &DAG) {
17016 SDValue In = Op.getOperand(0);
17017 MVT SVT = In.getSimpleValueType();
17019 if (SVT.getVectorElementType() == MVT::i1)
17020 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
17022 assert(Subtarget.hasAVX() && "Expected AVX support");
17023 return LowerAVXExtend(Op, DAG, Subtarget);
17024 }
17026 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
17027 /// It makes use of the fact that vectors with enough leading sign/zero bits
17028 /// prevent the PACKSS/PACKUS from saturating the results.
17029 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
17030 /// within each 128-bit lane.
17031 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
17032 const SDLoc &DL, SelectionDAG &DAG,
17033 const X86Subtarget &Subtarget) {
17034 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
17035 "Unexpected PACK opcode");
17037 // Requires SSE2 but AVX512 has fast vector truncate.
17038 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
17039 return SDValue();
17041 EVT SrcVT = In.getValueType();
17043 // No truncation required, we might get here due to recursive calls.
17044 if (SrcVT == DstVT)
17045 return In;
17047 // We only support vector truncation to 64bits or greater from a
17048 // 128bits or greater source.
17049 unsigned DstSizeInBits = DstVT.getSizeInBits();
17050 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
17051 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
17052 return SDValue();
17054 unsigned NumElems = SrcVT.getVectorNumElements();
17055 if (!isPowerOf2_32(NumElems))
17056 return SDValue();
17058 LLVMContext &Ctx = *DAG.getContext();
17059 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
17060 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
17062 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
17064 // Pack to the largest type possible:
17065 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
17066 EVT InVT = MVT::i16, OutVT = MVT::i8;
17067 if (SrcVT.getScalarSizeInBits() > 16 &&
17068 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
17069 InVT = MVT::i32;
17070 OutVT = MVT::i16;
17071 }
17073 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
17074 if (SrcVT.is128BitVector()) {
17075 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
17076 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
17077 In = DAG.getBitcast(InVT, In);
17078 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
17079 Res = extractSubVector(Res, 0, DAG, DL, 64);
17080 return DAG.getBitcast(DstVT, Res);
17081 }
17083 // Extract lower/upper subvectors.
17084 unsigned NumSubElts = NumElems / 2;
17085 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
17086 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
17088 unsigned SubSizeInBits = SrcSizeInBits / 2;
17089 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
17090 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
17092 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
17093 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
17094 Lo = DAG.getBitcast(InVT, Lo);
17095 Hi = DAG.getBitcast(InVT, Hi);
17096 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
17097 return DAG.getBitcast(DstVT, Res);
17098 }
17100 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
17101 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
17102 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
17103 Lo = DAG.getBitcast(InVT, Lo);
17104 Hi = DAG.getBitcast(InVT, Hi);
17105 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
17107 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
17108 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
17109 Res = DAG.getBitcast(MVT::v4i64, Res);
17110 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
17112 if (DstVT.is256BitVector())
17113 return DAG.getBitcast(DstVT, Res);
17115 // If 512bit -> 128bit truncate another stage.
17116 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
17117 Res = DAG.getBitcast(PackedVT, Res);
17118 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
17119 }
17121 // Recursively pack lower/upper subvectors, concat result and pack again.
17122 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
17123 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
17124 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
17125 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
17127 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
17128 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
17129 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
17130 }
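// Worked example (added commentary, not original): truncating v8i32 -> v8i16
// with PACKSS is legal when every i32 lane is sign-extended from 16 bits
// (e.g. 0xFFFF8000), because PACKSSDW's signed saturation is then a no-op.
// On AVX2 the pack operates per 128-bit lane, which is why the v4i64
// {0, 2, 1, 3} shuffle above reorders the lane halves afterwards.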
17132 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
17133 const X86Subtarget &Subtarget) {
17134 SDLoc DL(Op);
17136 MVT VT = Op.getSimpleValueType();
17137 SDValue In = Op.getOperand(0);
17138 MVT InVT = In.getSimpleValueType();
17140 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
17142 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
17143 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
17144 if (InVT.getScalarSizeInBits() <= 16) {
17145 if (Subtarget.hasBWI()) {
17146 // legal, will go to VPMOVB2M, VPMOVW2M
17147 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
17148 // We need to shift to get the lsb into sign position.
17149 // Shift packed bytes not supported natively, bitcast to word
17150 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
17151 In = DAG.getNode(ISD::SHL, DL, ExtVT,
17152 DAG.getBitcast(ExtVT, In),
17153 DAG.getConstant(ShiftInx, DL, ExtVT));
17154 In = DAG.getBitcast(InVT, In);
17155 }
17156 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
17157 In, ISD::SETGT);
17158 }
17159 // Use TESTD/Q, extended vector to packed dword/qword.
17160 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
17161 "Unexpected vector type.");
17162 unsigned NumElts = InVT.getVectorNumElements();
17163 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
17164 // We need to change to a wider element type that we have support for.
17165 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
17166 // For 16 element vectors we extend to v16i32 unless we are explicitly
17167 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
17168 // we need to split into two 8 element vectors which we can extend to v8i32,
17169 // truncate and concat the results. There's an additional complication if
17170 // the original type is v16i8. In that case we can't split the v16i8 so
17171 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
17172 // to v8i32, truncate that to v8i1 and concat the two halves.
17173 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
17174 if (InVT == MVT::v16i8) {
17175 // First we need to sign extend up to 256-bits so we can split that.
17176 InVT = MVT::v16i16;
17177 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
17178 }
17179 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
17180 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
17181 // We're split now, just emit two truncates and a concat. The two
17182 // truncates will trigger legalization to come back to this function.
17183 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
17184 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
17185 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17186 }
17187 // We either have 8 elements or we're allowed to use 512-bit vectors.
17188 // If we have VLX, we want to use the narrowest vector that can get the
17189 // job done so we use vXi32.
17190 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
17191 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
17192 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
17193 InVT = ExtVT;
17194 ShiftInx = InVT.getScalarSizeInBits() - 1;
17195 }
17197 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
17198 // We need to shift to get the lsb into sign position.
17199 In = DAG.getNode(ISD::SHL, DL, InVT, In,
17200 DAG.getConstant(ShiftInx, DL, InVT));
17201 }
17202 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
17203 if (Subtarget.hasDQI())
17204 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
17205 In, ISD::SETGT);
17206 return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
17207 ISD::SETNE);
17208 }
17210 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
17211 SDLoc DL(Op);
17212 MVT VT = Op.getSimpleValueType();
17213 SDValue In = Op.getOperand(0);
17214 MVT InVT = In.getSimpleValueType();
17215 unsigned InNumEltBits = InVT.getScalarSizeInBits();
17217 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
17218 "Invalid TRUNCATE operation");
17220 if (VT.getVectorElementType() == MVT::i1)
17221 return LowerTruncateVecI1(Op, DAG, Subtarget);
17223 // vpmovqb/w/d, vpmovdb/w, vpmovwb
17224 if (Subtarget.hasAVX512()) {
17225 // word to byte only under BWI
17226 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
17227 // Make sure we're allowed to promote 512-bits.
17228 if (Subtarget.canExtendTo512DQ())
17229 return DAG.getNode(ISD::TRUNCATE, DL, VT,
17230 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
17231 } else {
17232 return Op;
17233 }
17234 }
17236 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
17237 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
17239 // Truncate with PACKUS if we are truncating a vector with leading zero bits
17240 // that extend all the way to the packed/truncated value.
17241 // Pre-SSE41 we can only use PACKUSWB.
17242 KnownBits Known;
17243 DAG.computeKnownBits(In, Known);
17244 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
17245 if (SDValue V =
17246 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
17247 return V;
17249 // Truncate with PACKSS if we are truncating a vector with sign-bits that
17250 // extend all the way to the packed/truncated value.
17251 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
17252 if (SDValue V =
17253 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
17254 return V;
17256 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
17257 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
17258 if (Subtarget.hasInt256()) {
17259 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
17260 In = DAG.getBitcast(MVT::v8i32, In);
17261 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
17262 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
17263 DAG.getIntPtrConstant(0, DL));
17264 }
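// Illustrative note (added commentary, not original): viewing the v4i64
// input as v8i32, elements {0, 2, 4, 6} are exactly the low 32 bits of each
// i64, so the VPERMD-style shuffle above performs the truncate and the
// subvector extract keeps the four live lanes.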
17266 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17267 DAG.getIntPtrConstant(0, DL));
17268 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17269 DAG.getIntPtrConstant(2, DL));
17270 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17271 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17272 static const int ShufMask[] = {0, 2, 4, 6};
17273 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
17274 }
17276 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
17277 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
17278 if (Subtarget.hasInt256()) {
17279 In = DAG.getBitcast(MVT::v32i8, In);
17281 // The PSHUFB mask:
17282 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
17283 -1, -1, -1, -1, -1, -1, -1, -1,
17284 16, 17, 20, 21, 24, 25, 28, 29,
17285 -1, -1, -1, -1, -1, -1, -1, -1 };
17286 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
17287 In = DAG.getBitcast(MVT::v4i64, In);
17289 static const int ShufMask2[] = {0, 2, -1, -1};
17290 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
17291 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17292 DAG.getIntPtrConstant(0, DL));
17293 return DAG.getBitcast(VT, In);
17294 }
17296 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17297 DAG.getIntPtrConstant(0, DL));
17299 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17300 DAG.getIntPtrConstant(4, DL));
17302 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
17303 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
17305 // The PSHUFB mask:
17306 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
17307 -1, -1, -1, -1, -1, -1, -1, -1};
17309 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
17310 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
17312 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17313 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17315 // The MOVLHPS Mask:
17316 static const int ShufMask2[] = {0, 1, 4, 5};
17317 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
17318 return DAG.getBitcast(MVT::v8i16, res);
17319 }
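// Illustrative note (added commentary, not original): mask bytes
// {0,1,4,5,8,9,12,13} select the low 16 bits of each i32, so each PSHUFB
// leaves four truncated words in the low 64 bits of its half; the final
// {0,1,4,5} (MOVLHPS-like) shuffle merges the two halves into v8i16.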
17321 // Handle truncation of V256 to V128 using shuffles.
17322 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
17324 assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
17326 unsigned NumElems = VT.getVectorNumElements();
17327 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
17329 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
17330 // Prepare truncation shuffle mask
17331 for (unsigned i = 0; i != NumElems; ++i)
17332 MaskVec[i] = i * 2;
17333 In = DAG.getBitcast(NVT, In);
17334 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
17335 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
17336 DAG.getIntPtrConstant(0, DL));
17337 }
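// Illustrative note (added commentary, not original): e.g. for a v16i16 ->
// v16i8 truncate reaching this generic path, In is bitcast to v32i8 and the
// mask {0, 2, 4, ..., 30, -1, ...} keeps the low byte of every word before
// the low 128-bit subvector is extracted.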
17339 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
17340 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
17341 MVT VT = Op.getSimpleValueType();
17343 if (VT.isVector()) {
17344 SDValue Src = Op.getOperand(0);
17345 SDLoc dl(Op);
17347 if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
17348 MVT ResVT = MVT::v4i32;
17349 MVT TruncVT = MVT::v4i1;
17350 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
17351 if (!IsSigned && !Subtarget.hasVLX()) {
17352 // Widen to 512-bits.
17353 ResVT = MVT::v8i32;
17354 TruncVT = MVT::v8i1;
17355 Opc = ISD::FP_TO_UINT;
17356 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
17357 DAG.getUNDEF(MVT::v8f64),
17358 Src, DAG.getIntPtrConstant(0, dl));
17359 }
17360 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
17361 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
17362 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
17363 DAG.getIntPtrConstant(0, dl));
17364 }
17366 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
17367 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
17368 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
17369 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
17370 DAG.getUNDEF(MVT::v2f32)));
17371 }
17373 return SDValue();
17374 }
17376 assert(!VT.isVector());
17378 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
17379 IsSigned, /*IsReplace=*/ false);
17380 SDValue FIST = Vals.first, StackSlot = Vals.second;
17381 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
17382 if (!FIST.getNode())
17383 return Op;
17385 if (StackSlot.getNode())
17386 // Load the result.
17387 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
17389 // The node is the result.
17390 return FIST;
17391 }
17393 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
17394 SDLoc DL(Op);
17395 MVT VT = Op.getSimpleValueType();
17396 SDValue In = Op.getOperand(0);
17397 MVT SVT = In.getSimpleValueType();
17399 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
17401 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
17402 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
17403 In, DAG.getUNDEF(SVT)));
17404 }
17406 /// The only differences between FABS and FNEG are the mask and the logic op.
17407 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
17408 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
17409 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
17410 "Wrong opcode for lowering FABS or FNEG.");
17412 bool IsFABS = (Op.getOpcode() == ISD::FABS);
17414 // If this is a FABS and it has an FNEG user, bail out to fold the combination
17415 // into an FNABS. We'll lower the FABS after that if it is still in use.
17416 if (IsFABS)
17417 for (SDNode *User : Op->uses())
17418 if (User->getOpcode() == ISD::FNEG)
17419 return Op;
17421 SDLoc dl(Op);
17422 MVT VT = Op.getSimpleValueType();
17424 bool IsF128 = (VT == MVT::f128);
17426 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
17427 // decide if we should generate a 16-byte constant mask when we only need 4 or
17428 // 8 bytes for the scalar case.
17430 MVT LogicVT;
17431 MVT EltVT;
17433 if (VT.isVector()) {
17434 LogicVT = VT;
17435 EltVT = VT.getVectorElementType();
17436 } else if (IsF128) {
17437 // SSE instructions are used for optimized f128 logical operations.
17438 LogicVT = MVT::f128;
17439 EltVT = MVT::f128;
17440 } else {
17441 // There are no scalar bitwise logical SSE/AVX instructions, so we
17442 // generate a 16-byte vector constant and logic op even for the scalar case.
17443 // Using a 16-byte mask allows folding the load of the mask with
17444 // the logic op, so it can save (~4 bytes) on code size.
17445 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17446 EltVT = VT;
17447 }
17449 unsigned EltBits = EltVT.getSizeInBits();
17450 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
17451 APInt MaskElt =
17452 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
17453 const fltSemantics &Sem =
17454 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
17455 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17456 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
17458 SDValue Op0 = Op.getOperand(0);
17459 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
17460 unsigned LogicOp =
17461 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
17462 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
17464 if (VT.isVector() || IsF128)
17465 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17467 // For the scalar case extend to a 128-bit vector, perform the logic op,
17468 // and extract the scalar result back out.
17469 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
17470 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17471 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
17472 DAG.getIntPtrConstant(0, dl));
17473 }
17475 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
17476 SDValue Mag = Op.getOperand(0);
17477 SDValue Sign = Op.getOperand(1);
17478 SDLoc dl(Op);
17480 // If the sign operand is smaller, extend it first.
17481 MVT VT = Op.getSimpleValueType();
17482 if (Sign.getSimpleValueType().bitsLT(VT))
17483 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
17485 // And if it is bigger, shrink it first.
17486 if (Sign.getSimpleValueType().bitsGT(VT))
17487 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
17489 // At this point the operands and the result should have the same
17490 // type, and that won't be f80 since that is not custom lowered.
17491 bool IsF128 = (VT == MVT::f128);
17492 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
17493 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
17494 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
17495 "Unexpected type in LowerFCOPYSIGN");
17497 MVT EltVT = VT.getScalarType();
17498 const fltSemantics &Sem =
17499 EltVT == MVT::f64 ? APFloat::IEEEdouble()
17500 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17502 // Perform all scalar logic operations as 16-byte vectors because there are no
17503 // scalar FP logic instructions in SSE.
17504 // TODO: This isn't necessary. If we used scalar types, we might avoid some
17505 // unnecessary splats, but we might miss load folding opportunities. Should
17506 // this decision be based on OptimizeForSize?
17507 bool IsFakeVector = !VT.isVector() && !IsF128;
17508 MVT LogicVT = VT;
17509 if (IsFakeVector)
17510 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17512 // The mask constants are automatically splatted for vector types.
17513 unsigned EltSizeInBits = VT.getScalarSizeInBits();
17514 SDValue SignMask = DAG.getConstantFP(
17515 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17516 SDValue MagMask = DAG.getConstantFP(
17517 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17519 // First, clear all bits but the sign bit from the second operand (sign).
17520 if (IsFakeVector)
17521 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
17522 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
17524 // Next, clear the sign bit from the first operand (magnitude).
17525 // TODO: If we had general constant folding for FP logic ops, this check
17526 // wouldn't be necessary.
17527 SDValue MagBits;
17528 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
17529 APFloat APF = Op0CN->getValueAPF();
17530 APF.clearSign();
17531 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
17532 } else {
17533 // If the magnitude operand wasn't a constant, we need to AND out the sign.
17534 if (IsFakeVector)
17535 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
17536 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
17537 }
17539 // OR the magnitude value with the sign bit.
17540 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
17541 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
17542 DAG.getIntPtrConstant(0, dl));
17543 }
17545 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
17546 SDValue N0 = Op.getOperand(0);
17547 SDLoc dl(Op);
17548 MVT VT = Op.getSimpleValueType();
17550 MVT OpVT = N0.getSimpleValueType();
17551 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
17552 "Unexpected type for FGETSIGN");
17554 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
17555 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
17556 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
17557 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
17558 Res = DAG.getZExtOrTrunc(Res, dl, VT);
17559 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
17560 return Res;
17561 }
17563 /// Helper for creating a X86ISD::SETCC node.
17564 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17565 SelectionDAG &DAG) {
17566 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17567 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17568 }
17570 // Check whether an OR'd tree is PTEST-able.
17571 static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
17572 const X86Subtarget &Subtarget,
17573 SelectionDAG &DAG) {
17574 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
17576 if (!Subtarget.hasSSE41())
17577 return SDValue();
17579 if (!Op->hasOneUse())
17580 return SDValue();
17582 SDNode *N = Op.getNode();
17583 SDLoc DL(N);
17585 SmallVector<SDValue, 8> Opnds;
17586 DenseMap<SDValue, unsigned> VecInMap;
17587 SmallVector<SDValue, 8> VecIns;
17588 EVT VT = MVT::Other;
17590 // Recognize a special case where a vector is casted into wide integer to
17591 // test all 0s.
17592 Opnds.push_back(N->getOperand(0));
17593 Opnds.push_back(N->getOperand(1));
17595 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
17596 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
17597 // BFS traverse all OR'd operands.
17598 if (I->getOpcode() == ISD::OR) {
17599 Opnds.push_back(I->getOperand(0));
17600 Opnds.push_back(I->getOperand(1));
17601 // Re-evaluate the number of nodes to be traversed.
17602 e += 2; // 2 more nodes (LHS and RHS) are pushed.
17603 continue;
17604 }
17606 // Quit if a non-EXTRACT_VECTOR_ELT
17607 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17608 return SDValue();
17610 // Quit if without a constant index.
17611 SDValue Idx = I->getOperand(1);
17612 if (!isa<ConstantSDNode>(Idx))
17613 return SDValue();
17615 SDValue ExtractedFromVec = I->getOperand(0);
17616 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
17617 if (M == VecInMap.end()) {
17618 VT = ExtractedFromVec.getValueType();
17619 // Quit if not 128/256-bit vector.
17620 if (!VT.is128BitVector() && !VT.is256BitVector())
17621 return SDValue();
17622 // Quit if not the same type.
17623 if (VecInMap.begin() != VecInMap.end() &&
17624 VT != VecInMap.begin()->first.getValueType())
17625 return SDValue();
17626 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
17627 VecIns.push_back(ExtractedFromVec);
17628 }
17629 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
17630 }
17632 assert((VT.is128BitVector() || VT.is256BitVector()) &&
17633 "Not extracted from 128-/256-bit vector.");
17635 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
17637 for (DenseMap<SDValue, unsigned>::const_iterator
17638 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
17639 // Quit if not all elements are used.
17640 if (I->second != FullMask)
17641 return SDValue();
17642 }
17644 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
17646 // Cast all vectors into TestVT for PTEST.
17647 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
17648 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
17650 // If more than one full vector is evaluated, OR them first before PTEST.
17651 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
17652 // Each iteration will OR 2 nodes and append the result until there is only
17653 // 1 node left, i.e. the final OR'd value of all vectors.
17654 SDValue LHS = VecIns[Slot];
17655 SDValue RHS = VecIns[Slot + 1];
17656 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
17657 }
17659 SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
17660 VecIns.back(), VecIns.back());
17661 return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
17662 }
17664 /// Return true if \c Op has a use that doesn't just read flags.
17665 static bool hasNonFlagsUse(SDValue Op) {
17666 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
17667 ++UI) {
17668 SDNode *User = *UI;
17669 unsigned UOpNo = UI.getOperandNo();
17670 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
17671 // Look past the truncate.
17672 UOpNo = User->use_begin().getOperandNo();
17673 User = *User->use_begin();
17674 }
17676 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
17677 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
17683 /// Emit nodes that will be selected as "test Op0,Op0", or something
17685 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
17686 SelectionDAG &DAG) const {
17687 // CF and OF aren't always set the way we want. Determine which
17688 // of these we need.
17689 bool NeedCF = false;
17690 bool NeedOF = false;
17691 switch (X86CC) {
17692 default: break;
17693 case X86::COND_A: case X86::COND_AE:
17694 case X86::COND_B: case X86::COND_BE:
17695 NeedCF = true;
17696 break;
17697 case X86::COND_G: case X86::COND_GE:
17698 case X86::COND_L: case X86::COND_LE:
17699 case X86::COND_O: case X86::COND_NO: {
17700 // Check if we really need to set the
17701 // Overflow flag. If NoSignedWrap is present
17702 // that is not actually needed.
17703 switch (Op->getOpcode()) {
17704 case ISD::ADD:
17705 case ISD::SUB:
17706 case ISD::MUL:
17707 case ISD::SHL:
17708 if (Op.getNode()->getFlags().hasNoSignedWrap())
17709 break;
17710 LLVM_FALLTHROUGH;
17711 default:
17712 NeedOF = true;
17713 break;
17714 }
17715 break;
17716 }
17717 }
17718 // See if we can use the EFLAGS value from the operand instead of
17719 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
17720 // we prove that the arithmetic won't overflow, we can't use OF or CF.
17721 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
17722 // Emit a CMP with 0, which is the TEST pattern.
17723 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17724 DAG.getConstant(0, dl, Op.getValueType()));
17725 }
17726 unsigned Opcode = 0;
17727 unsigned NumOperands = 0;
17729 // Truncate operations may prevent the merge of the SETCC instruction
17730 // and the arithmetic instruction before it. Attempt to truncate the operands
17731 // of the arithmetic instruction and use a reduced bit-width instruction.
17732 bool NeedTruncation = false;
17733 SDValue ArithOp = Op;
17734 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
17735 SDValue Arith = Op->getOperand(0);
17736 // Both the trunc and the arithmetic op need to have one user each.
17737 if (Arith->hasOneUse())
17738 switch (Arith.getOpcode()) {
17739 default: break;
17740 case ISD::ADD:
17741 case ISD::SUB:
17742 case ISD::AND:
17743 case ISD::OR:
17744 case ISD::XOR:
17745 NeedTruncation = true;
17746 ArithOp = Arith;
17747 break;
17748 }
17749 }
17751 // Sometimes flags can be set either with an AND or with an SRL/SHL
17752 // instruction. SRL/SHL variant should be preferred for masks longer than this
17753 // number of bits.
17754 const int ShiftToAndMaxMaskWidth = 32;
17755 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
17757 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
17758 // which may be the result of a CAST. We use the variable 'Op', which is the
17759 // non-casted variable when we check for possible users.
17760 switch (ArithOp.getOpcode()) {
17761 case ISD::ADD:
17762 // We only want to rewrite this as a target-specific node with attached
17763 // flags if there is a reasonable chance of either using that to do custom
17764 // instructions selection that can fold some of the memory operands, or if
17765 // only the flags are used. If there are other uses, leave the node alone
17766 // and emit a test instruction.
17767 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17768 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17769 if (UI->getOpcode() != ISD::CopyToReg &&
17770 UI->getOpcode() != ISD::SETCC &&
17771 UI->getOpcode() != ISD::STORE)
17772 goto default_case;
17774 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
17775 // An add of one will be selected as an INC.
17776 if (C->isOne() &&
17777 (!Subtarget.slowIncDec() ||
17778 DAG.getMachineFunction().getFunction().optForSize())) {
17779 Opcode = X86ISD::INC;
17780 NumOperands = 1;
17781 break;
17782 }
17784 // An add of negative one (subtract of one) will be selected as a DEC.
17785 if (C->isAllOnesValue() &&
17786 (!Subtarget.slowIncDec() ||
17787 DAG.getMachineFunction().getFunction().optForSize())) {
17788 Opcode = X86ISD::DEC;
17789 NumOperands = 1;
17790 break;
17791 }
17792 }
17794 // Otherwise use a regular EFLAGS-setting add.
17795 Opcode = X86ISD::ADD;
17796 NumOperands = 2;
17797 break;
17798 case ISD::SHL:
17799 case ISD::SRL:
17800 // If we have a constant logical shift that's only used in a comparison
17801 // against zero turn it into an equivalent AND. This allows turning it into
17802 // a TEST instruction later.
17803 if (ZeroCheck && Op->hasOneUse() &&
17804 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
17805 EVT VT = Op.getValueType();
17806 unsigned BitWidth = VT.getSizeInBits();
17807 unsigned ShAmt = Op->getConstantOperandVal(1);
17808 if (ShAmt >= BitWidth) // Avoid undefined shifts.
17809 break;
17810 APInt Mask = ArithOp.getOpcode() == ISD::SRL
17811 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
17812 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
17813 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17814 break;
17815 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
17816 DAG.getConstant(Mask, dl, VT));
17817 }
17818 break;
17820 case ISD::AND:
17821 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
17822 // because a TEST instruction will be better. However, AND should be
17823 // preferred if the instruction can be combined into ANDN.
17824 if (!hasNonFlagsUse(Op)) {
17825 SDValue Op0 = ArithOp->getOperand(0);
17826 SDValue Op1 = ArithOp->getOperand(1);
17827 EVT VT = ArithOp.getValueType();
17828 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
17829 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
17830 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
17832 // If we cannot select an ANDN instruction, check if we can replace
17833 // AND+IMM64 with a shift before giving up. This is possible for masks
17834 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
17835 if (!isProperAndn) {
17836 // TODO: only do this when the node is used only in comparison with zero.
17837 if (!ZeroCheck)
17838 break;
17839 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
17840 auto *CN = dyn_cast<ConstantSDNode>(Op1);
17841 if (!CN)
17842 break;
17844 const APInt &Mask = CN->getAPIntValue();
17845 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17846 break; // Prefer TEST instruction.
17848 unsigned BitWidth = Mask.getBitWidth();
17849 unsigned LeadingOnes = Mask.countLeadingOnes();
17850 unsigned TrailingZeros = Mask.countTrailingZeros();
17852 if (LeadingOnes + TrailingZeros == BitWidth) {
17853 assert(TrailingZeros < VT.getSizeInBits() &&
17854 "Shift amount should be less than the type width");
17855 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17856 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
17857 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
17858 break;
17859 }
17861 unsigned LeadingZeros = Mask.countLeadingZeros();
17862 unsigned TrailingOnes = Mask.countTrailingOnes();
17864 if (LeadingZeros + TrailingOnes == BitWidth) {
17865 assert(LeadingZeros < VT.getSizeInBits() &&
17866 "Shift amount should be less than the type width");
17867 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17868 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
17869 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
17870 break;
17871 }
17872 }
17873 }
17875 LLVM_FALLTHROUGH;
17876 case ISD::SUB:
17877 case ISD::OR:
17878 case ISD::XOR:
17880 // Similar to ISD::ADD above, check if the uses will preclude useful
17881 // lowering of the target-specific node.
17882 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17883 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17884 if (UI->getOpcode() != ISD::CopyToReg &&
17885 UI->getOpcode() != ISD::SETCC &&
17886 UI->getOpcode() != ISD::STORE)
17887 goto default_case;
17889 // Otherwise use a regular EFLAGS-setting instruction.
17890 switch (ArithOp.getOpcode()) {
17891 default: llvm_unreachable("unexpected operator!");
17892 case ISD::SUB: Opcode = X86ISD::SUB; break;
17893 case ISD::XOR: Opcode = X86ISD::XOR; break;
17894 case ISD::AND: Opcode = X86ISD::AND; break;
17895 case ISD::OR: Opcode = X86ISD::OR; break;
17896 }
17898 NumOperands = 2;
17899 break;
17900 case X86ISD::ADD:
17901 case X86ISD::SUB:
17902 case X86ISD::INC:
17903 case X86ISD::DEC:
17904 case X86ISD::OR:
17905 case X86ISD::XOR:
17906 case X86ISD::AND:
17907 return SDValue(Op.getNode(), 1);
17908 default:
17909 default_case:
17910 break;
17911 }
17913 // If we found that truncation is beneficial, perform the truncation and
17914 // update 'Op'.
17915 if (NeedTruncation) {
17916 EVT VT = Op.getValueType();
17917 SDValue WideVal = Op->getOperand(0);
17918 EVT WideVT = WideVal.getValueType();
17919 unsigned ConvertedOp = 0;
17920 // Use a target machine opcode to prevent further DAGCombine
17921 // optimizations that may separate the arithmetic operations
17922 // from the setcc node.
17923 switch (WideVal.getOpcode()) {
17924 default: break;
17925 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17926 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17927 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17928 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17929 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
17930 }
17932 if (ConvertedOp) {
17933 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17934 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17935 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17936 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17937 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17938 Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
17939 }
17940 }
17941 }
17943 if (Opcode == 0)
17944 // Emit a CMP with 0, which is the TEST pattern.
17945 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17946 DAG.getConstant(0, dl, Op.getValueType()));
17948 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17949 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17951 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17952 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
17953 return SDValue(New.getNode(), 1);
17954 }
17956 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
17957 /// equivalent.
17958 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17959 const SDLoc &dl, SelectionDAG &DAG) const {
17960 if (isNullConstant(Op1))
17961 return EmitTest(Op0, X86CC, dl, DAG);
17963 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17964 "Unexpected comparison operation for MVT::i1 operands");
17966 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17967 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17968 // Only promote the compare up to I32 if it is a 16 bit operation
17969 // with an immediate. 16 bit immediates are to be avoided.
17970 if ((Op0.getValueType() == MVT::i16 &&
17971 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17972 !DAG.getMachineFunction().getFunction().optForMinSize() &&
17973 !Subtarget.isAtom()) {
17974 unsigned ExtendOp =
17975 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17976 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17977 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17978 }
17979 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17980 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17981 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
17982 return SDValue(Sub.getNode(), 1);
17983 }
17984 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17985 }
17987 /// Convert a comparison if required by the subtarget.
17988 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17989 SelectionDAG &DAG) const {
17990 // If the subtarget does not support the FUCOMI instruction, floating-point
17991 // comparisons have to be converted.
17992 if (Subtarget.hasCMov() ||
17993 Cmp.getOpcode() != X86ISD::CMP ||
17994 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17995 !Cmp.getOperand(1).getValueType().isFloatingPoint())
17996 return Cmp;
17998 // The instruction selector will select an FUCOM instruction instead of
17999 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
18000 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
18001 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
18002 SDLoc dl(Cmp);
18003 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
18004 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
18005 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
18006 DAG.getConstant(8, dl, MVT::i8));
18007 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
18009 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
18010 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
18011 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
18012 }
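// Illustrative note (added commentary, not original): FNSTSW leaves the x87
// condition bits C0/C2/C3 in bits 8..14 of FPSW; the SRL by 8 plus the i8
// truncate move them into AH, and SAHF then materializes the same EFLAGS
// that a native FUCOMI would have produced.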
18014 /// Check if replacement of SQRT with RSQRT should be disabled.
18015 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
18016 EVT VT = Op.getValueType();
18018 // We never want to use both SQRT and RSQRT instructions for the same input.
18019 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
18020 return false;
18022 if (VT.isVector())
18023 return Subtarget.hasFastVectorFSQRT();
18024 return Subtarget.hasFastScalarFSQRT();
18025 }
18027 /// The minimum architected relative accuracy is 2^-12. We need one
18028 /// Newton-Raphson step to have a good float result (24 bits of precision).
18029 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
18030 SelectionDAG &DAG, int Enabled,
18031 int &RefinementSteps,
18032 bool &UseOneConstNR,
18033 bool Reciprocal) const {
18034 EVT VT = Op.getValueType();
18036 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
18037 // It is likely not profitable to do this for f64 because a double-precision
18038 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
18039 // instructions: convert to single, rsqrtss, convert back to double, refine
18040 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
18041 // along with FMA, this could be a throughput win.
18042 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
18043 // after legalize types.
18044 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
18045 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
18046 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
18047 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
18048 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
18049 if (RefinementSteps == ReciprocalEstimate::Unspecified)
18050 RefinementSteps = 1;
18052 UseOneConstNR = false;
18053 // There is no FSQRT for 512-bits, but there is RSQRT14.
18054 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
18055 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
18056 }
18058 return SDValue();
18059 }
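// Illustrative note (added commentary, not original): the rsqrt refinement
// performed by the caller is the Newton-Raphson step
//   E' = E * (1.5 - 0.5 * X * E * E)
// which roughly doubles RSQRTPS's ~12 accurate bits, reaching full f32
// precision after the single step requested above.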
18060 /// The minimum architected relative accuracy is 2^-12. We need one
18061 /// Newton-Raphson step to have a good float result (24 bits of precision).
18062 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
18063 int Enabled,
18064 int &RefinementSteps) const {
18065 EVT VT = Op.getValueType();
18067 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
18068 // It is likely not profitable to do this for f64 because a double-precision
18069 // reciprocal estimate with refinement on x86 prior to FMA requires
18070 // 15 instructions: convert to single, rcpss, convert back to double, refine
18071 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
18072 // along with FMA, this could be a throughput win.
18074 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
18075 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
18076 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
18077 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
18078 // Enable estimate codegen with 1 refinement step for vector division.
18079 // Scalar division estimates are disabled because they break too much
18080 // real-world code. These defaults are intended to match GCC behavior.
18081 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
18082 return SDValue();
18084 if (RefinementSteps == ReciprocalEstimate::Unspecified)
18085 RefinementSteps = 1;
18087 // There is no FSQRT for 512-bits, but there is RCP14.
18088 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
18089 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
18090 }
18092 return SDValue();
18093 }
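// Illustrative note (added commentary, not original): the reciprocal
// refinement is the Newton-Raphson step
//   E' = E * (2.0 - X * E)
// so one step lifts RCPPS's ~12-bit estimate to roughly 24-bit (f32)
// accuracy.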
18094 /// If we have at least two divisions that use the same divisor, convert to
18095 /// multiplication by a reciprocal. This may need to be adjusted for a given
18096 /// CPU if a division's cost is not at least twice the cost of a multiplication.
18097 /// This is because we still need one division to calculate the reciprocal and
18098 /// then we need two multiplies by that reciprocal as replacements for the
18099 /// original divisions.
18100 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
18101 return 2;
18102 }
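// Illustrative note (added commentary, not original): with this threshold of
// two, IR like "a/d; b/d" is rewritten as "r = 1.0/d; a*r; b*r" - a single
// divide plus two cheaper multiplies instead of two divides.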
18104 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
18105 /// according to equal/not-equal condition code \p CC.
18106 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
18107 const SDLoc &dl, SelectionDAG &DAG) {
18108 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
18109 // instruction. Since the shift amount is in-range-or-undefined, we know
18110 // that doing a bittest on the i32 value is ok. We extend to i32 because
18111 // the encoding for the i16 version is larger than the i32 version.
18112 // Also promote i16 to i32 for performance / code size reason.
18113 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
18114 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
18116 // See if we can use the 32-bit instruction instead of the 64-bit one for a
18117 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
18118 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
18119 // known to be zero.
18120 if (Src.getValueType() == MVT::i64 &&
18121 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
18122 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
18124 // If the operand types disagree, extend the shift amount to match. Since
18125 // BT ignores high bits (like shifts) we can use anyextend.
18126 if (Src.getValueType() != BitNo.getValueType())
18127 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
18129 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
18130 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
18131 return getSETCC(Cond, BT, dl, DAG);
18132 }
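// Illustrative note (added commentary, not original): "(X >> N) & 1" becomes
// "bt N, X"; BT32 indexes modulo 32 while BT64 indexes modulo 64, which is
// why the i64 -> i32 narrowing above requires bit 5 of BitNo to be known
// zero.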
18134 /// Result of 'and' is compared against zero. Change to a BT node if possible.
18135 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
18136 const SDLoc &dl, SelectionDAG &DAG) {
18137 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
18138 SDValue Op0 = And.getOperand(0);
18139 SDValue Op1 = And.getOperand(1);
18140 if (Op0.getOpcode() == ISD::TRUNCATE)
18141 Op0 = Op0.getOperand(0);
18142 if (Op1.getOpcode() == ISD::TRUNCATE)
18143 Op1 = Op1.getOperand(0);
18145 SDValue LHS, RHS;
18146 if (Op1.getOpcode() == ISD::SHL)
18147 std::swap(Op0, Op1);
18148 if (Op0.getOpcode() == ISD::SHL) {
18149 if (isOneConstant(Op0.getOperand(0))) {
18150 // If we looked past a truncate, check that it's only truncating away
18151 // known-zero bits.
18152 unsigned BitWidth = Op0.getValueSizeInBits();
18153 unsigned AndBitWidth = And.getValueSizeInBits();
18154 if (BitWidth > AndBitWidth) {
18155 KnownBits Known;
18156 DAG.computeKnownBits(Op0, Known);
18157 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
18158 return SDValue();
18159 }
18160 LHS = Op1;
18161 RHS = Op0.getOperand(1);
18162 }
18163 } else if (Op1.getOpcode() == ISD::Constant) {
18164 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
18165 uint64_t AndRHSVal = AndRHS->getZExtValue();
18166 SDValue AndLHS = Op0;
18168 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
18169 LHS = AndLHS.getOperand(0);
18170 RHS = AndLHS.getOperand(1);
18171 }
18172 // Use BT if the immediate can't be encoded in a TEST instruction or we
18173 // are optimizing for size and the immediate won't fit in a byte.
18174 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
18175 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
18176 isPowerOf2_64(AndRHSVal)) {
18177 LHS = AndLHS;
18178 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
18179 }
18180 }
18181 }
18183 if (LHS.getNode())
18184 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
18186 return SDValue();
18187 }
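// Illustrative note (added commentary, not original): a test such as
// "(x & (1ULL << 40)) != 0" cannot encode its mask in a TEST immediate, so
// it is lowered here to "bt $40, %x" with the bit index as the RHS.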
18189 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
18190 /// CMPs.
18191 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
18192 SDValue &Op1) {
18193 unsigned SSECC;
18194 bool Swap = false;
18196 // SSE Condition code mapping:
18197 // 0 - EQ
18198 // 1 - LT
18199 // 2 - LE
18200 // 3 - UNORD
18201 // 4 - NEQ
18202 // 5 - NLT
18203 // 6 - NLE
18204 // 7 - ORD
18205 switch (SetCCOpcode) {
18206 default: llvm_unreachable("Unexpected SETCC condition");
18207 case ISD::SETOEQ:
18208 case ISD::SETEQ: SSECC = 0; break;
18209 case ISD::SETOGT:
18210 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
18211 case ISD::SETLT:
18212 case ISD::SETOLT: SSECC = 1; break;
18213 case ISD::SETOGE:
18214 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
18215 case ISD::SETLE:
18216 case ISD::SETOLE: SSECC = 2; break;
18217 case ISD::SETUO: SSECC = 3; break;
18218 case ISD::SETUNE:
18219 case ISD::SETNE: SSECC = 4; break;
18220 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
18221 case ISD::SETUGE: SSECC = 5; break;
18222 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
18223 case ISD::SETUGT: SSECC = 6; break;
18224 case ISD::SETO: SSECC = 7; break;
18225 case ISD::SETUEQ: SSECC = 8; break;
18226 case ISD::SETONE: SSECC = 12; break;
18227 }
18228 if (Swap)
18229 std::swap(Op0, Op1);
18231 return SSECC;
18232 }
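// Illustrative note (added commentary, not original): e.g. SETOLT maps
// directly to SSECC = 1 (CMPLTPS), while SETGT has no direct predicate
// pre-AVX, so the operands are swapped and "a > b" is emitted as "b < a".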
18234 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
18235 /// concatenate the result back.
18236 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
18237 MVT VT = Op.getSimpleValueType();
18239 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
18240 "Unsupported value type for operation");
18242 unsigned NumElems = VT.getVectorNumElements();
18243 SDLoc dl(Op);
18244 SDValue CC = Op.getOperand(2);
18246 // Extract the LHS vectors
18247 SDValue LHS = Op.getOperand(0);
18248 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
18249 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
18251 // Extract the RHS vectors
18252 SDValue RHS = Op.getOperand(1);
18253 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
18254 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
18256 // Issue the operation on the smaller types and concatenate the result back
18257 MVT EltVT = VT.getVectorElementType();
18258 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18259 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18260 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
18261 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
18262 }
18264 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
18266 SDValue Op0 = Op.getOperand(0);
18267 SDValue Op1 = Op.getOperand(1);
18268 SDValue CC = Op.getOperand(2);
18269 MVT VT = Op.getSimpleValueType();
18270 SDLoc dl(Op);
18272 assert(VT.getVectorElementType() == MVT::i1 &&
18273 "Cannot set masked compare for this operation");
18275 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
18277 // If this is a seteq make sure any build vectors of all zeros are on the RHS.
18278 // This helps with vptestm matching.
18279 // TODO: Should we just canonicalize the setcc during DAG combine?
18280 if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
18281 ISD::isBuildVectorAllZeros(Op0.getNode()))
18282 std::swap(Op0, Op1);
18284 // Prefer SETGT over SETLT.
18285 if (SetCCOpcode == ISD::SETLT) {
18286 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
18287 std::swap(Op0, Op1);
18288 }
18290 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
18291 }
18293 /// Try to turn a VSETULT into a VSETULE by modifying its second
18294 /// operand \p Op1. If non-trivial (for example because it's not constant)
18295 /// return an empty value.
18296 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
18297 SelectionDAG &DAG) {
18298 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
18299 if (!BV)
18300 return SDValue();
18302 MVT VT = Op1.getSimpleValueType();
18303 MVT EVT = VT.getVectorElementType();
18304 unsigned n = VT.getVectorNumElements();
18305 SmallVector<SDValue, 8> ULTOp1;
18307 for (unsigned i = 0; i < n; ++i) {
18308 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
18309 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
18310 return SDValue();
18312 // Avoid underflow.
18313 APInt Val = Elt->getAPIntValue();
18314 if (Val == 0)
18315 return SDValue();
18317 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
18318 }
18320 return DAG.getBuildVector(VT, dl, ULTOp1);
18321 }
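// Illustrative note (added commentary, not original): "x <u <5,5,5,5>"
// becomes "x <=u <4,4,4,4>"; the rewrite is invalid for a zero element,
// which is why a zero constant bails out above.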
18323 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
18324 /// Op0 u<= Op1:
18325 /// t = psubus Op0, Op1
18326 /// pcmpeq t, <0..0>
18327 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
18328 ISD::CondCode Cond, const SDLoc &dl,
18329 const X86Subtarget &Subtarget,
18330 SelectionDAG &DAG) {
18331 if (!Subtarget.hasSSE2())
18332 return SDValue();
18334 MVT VET = VT.getVectorElementType();
18335 if (VET != MVT::i8 && VET != MVT::i16)
18336 return SDValue();
18338 switch (Cond) {
18339 default:
18340 return SDValue();
18341 case ISD::SETULT: {
18342 // If the comparison is against a constant we can turn this into a
18343 // setule. With psubus, setule does not require a swap. This is
18344 // beneficial because the constant in the register is no longer
18345 // clobbered as the destination, so it can be hoisted out of a loop.
18346 // Only do this pre-AVX since vpcmp* is no longer destructive.
18347 if (Subtarget.hasAVX())
18348 return SDValue();
18349 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
18350 if (!ULEOp1)
18351 return SDValue();
18352 Op1 = ULEOp1;
18353 break;
18354 }
18355 // Psubus is better than flip-sign because it requires no inversion.
18356 case ISD::SETUGE:
18357 std::swap(Op0, Op1);
18358 break;
18359 case ISD::SETULE:
18360 break;
18361 }
18363 SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
18364 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
18365 getZeroVector(VT, Subtarget, DAG, dl));
18366 }
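// Illustrative note (added commentary, not original): "a <=u b" holds iff
// the saturating subtraction a -us b is zero: a=3, b=7 gives 3 -us 7 = 0
// (true), while a=9, b=7 gives 9 -us 7 = 2 (false), so PSUBUS + PCMPEQ
// implements the compare without sign-bit flipping.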
18368 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
18369 SelectionDAG &DAG) {
18370 SDValue Op0 = Op.getOperand(0);
18371 SDValue Op1 = Op.getOperand(1);
18372 SDValue CC = Op.getOperand(2);
18373 MVT VT = Op.getSimpleValueType();
18374 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
18375 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
18376 SDLoc dl(Op);
18378 if (isFP) {
18379 #ifndef NDEBUG
18380 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
18381 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
18382 #endif
18384 unsigned Opc;
18385 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
18386 assert(VT.getVectorNumElements() <= 16);
18387 Opc = X86ISD::CMPM;
18388 } else {
18389 Opc = X86ISD::CMPP;
18390 // The SSE/AVX packed FP comparison nodes are defined with a
18391 // floating-point vector result that matches the operand type. This allows
18392 // them to work with an SSE1 target (integer vector types are not legal).
18393 VT = Op0.getSimpleValueType();
18394 }
18396 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
18397 // emit two comparisons and a logic op to tie them together.
18398 SDValue Cmp;
18399 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
18400 if (SSECC >= 8 && !Subtarget.hasAVX()) {
18401 // LLVM predicate is SETUEQ or SETONE.
18402 unsigned CC0, CC1;
18403 unsigned CombineOpc;
18404 if (Cond == ISD::SETUEQ) {
18405 CC0 = 3; // UNORD
18406 CC1 = 0; // EQ
18407 CombineOpc = X86ISD::FOR;
18408 } else {
18409 assert(Cond == ISD::SETONE);
18410 CC0 = 7; // ORD
18411 CC1 = 4; // NEQ
18412 CombineOpc = X86ISD::FAND;
18413 }
18415 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18416 DAG.getConstant(CC0, dl, MVT::i8));
18417 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18418 DAG.getConstant(CC1, dl, MVT::i8));
18419 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
18421 // Handle all other FP comparisons here.
18422 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
18423 DAG.getConstant(SSECC, dl, MVT::i8));
18426 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
18427 // result type of SETCC. The bitcast is expected to be optimized away
18428 // during combining/isel.
18429 if (Opc == X86ISD::CMPP)
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

    return Cmp;
  }
18435 MVT VTOp0 = Op0.getSimpleValueType();
18436 assert(VTOp0 == Op1.getSimpleValueType() &&
18437 "Expected operands with same type!");
18438 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
18439 "Invalid number of packed elements for source and destination!");
18441 // This is being called by type legalization because v2i32 is marked custom
18442 // for result type legalization for v2f32.
  if (VTOp0 == MVT::v2i32)
    return SDValue();
18446 // The non-AVX512 code below works under the assumption that source and
18447 // destination types are the same.
18448 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
18449 "Value types for source and destination must be the same!");
18451 // Break 256-bit integer vector compare into smaller ones.
18452 if (VT.is256BitVector() && !Subtarget.hasInt256())
18453 return Lower256IntVSETCC(Op, DAG);
18455 // The result is boolean, but operands are int/float
18456 if (VT.getVectorElementType() == MVT::i1) {
18457 // In AVX-512 architecture setcc returns mask with i1 elements,
18458 // But there is no compare instruction for i8 and i16 elements in KNL.
18459 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
18460 "Unexpected operand type");
    return LowerIntVSETCC_AVX512(Op, DAG);
  }
18464 // Lower using XOP integer comparisons.
18465 if (VT.is128BitVector() && Subtarget.hasXOP()) {
18466 // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (Cond) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    }
    // Are we comparing unsigned or signed integers?
    unsigned Opc =
        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
18486 return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(CmpMode, dl, MVT::i8));
  }
18490 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
18491 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
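  // For example (illustrative): (setne (and X, 8), 0) becomes
  // (seteq (and X, 8), 8), avoiding the NOT that the PCMPEQ+invert lowering
  // below would otherwise need.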
18492 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
18493 SDValue BC0 = peekThroughBitcasts(Op0);
    if (BC0.getOpcode() == ISD::AND) {
      APInt UndefElts;
18496 SmallVector<APInt, 64> EltBits;
18497 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
18498 VT.getScalarSizeInBits(), UndefElts,
18499 EltBits, false, false)) {
        if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
          Cond = ISD::SETEQ;
          Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
        }
      }
    }
  }
18508 // If this is a SETNE against the signed minimum value, change it to SETGT.
  // If this is a SETNE against the signed maximum value, change it to SETLT,
  // which will be swapped to SETGT.
  // Otherwise we use PCMPEQ+invert.
  APInt ConstValue;
18513 if (Cond == ISD::SETNE &&
18514 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
    if (ConstValue.isMinSignedValue())
      Cond = ISD::SETGT;
    else if (ConstValue.isMaxSignedValue())
      Cond = ISD::SETLT;
  }
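  // For example (illustrative): (v4i32 setne X, <INT_MIN,...>) becomes
  // (v4i32 setgt X, <INT_MIN,...>); the only value not greater than INT_MIN
  // is INT_MIN itself, so the two predicates agree and no invert is needed.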
18521 // If both operands are known non-negative, then an unsigned compare is the
18522 // same as a signed compare and there's no need to flip signbits.
18523 // TODO: We could check for more general simplifications here since we're
18524 // computing known bits.
18525 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
18526 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
18528 // Special case: Use min/max operations for unsigned compares. We only want
18529 // to do this for unsigned compares if we need to flip signs or if it allows
  // us to avoid an invert.
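  // For example (illustrative): (setule X, Y) can be emitted as
  // (pcmpeq (umin X, Y), X), with no final inversion required.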
18531 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18532 if (ISD::isUnsignedIntSetCC(Cond) &&
18533 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
18534 TLI.isOperationLegal(ISD::UMIN, VT)) {
    bool Invert = false;
    unsigned Opc;
    switch (Cond) {
18538 default: llvm_unreachable("Unexpected condition code");
18539 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
18540 case ISD::SETULE: Opc = ISD::UMIN; break;
18541 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETUGE: Opc = ISD::UMAX; break;
    }
18545 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18546 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
    // If the logical-not of the result is required, perform that now.
    if (Invert)
      Result = DAG.getNOT(dl, Result, VT);

    return Result;
  }
18555 // Try to use SUBUS and PCMPEQ.
  if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
    return V;
18559 // We are handling one of the integer comparisons here. Since SSE only has
18560 // GT and EQ comparisons for integer, swapping operands and multiple
18561 // operations may be required for some comparisons.
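  // For example (illustrative): (setlt X, Y) is emitted as (pcmpgt Y, X),
  // and (setge X, Y) as NOT(pcmpgt Y, X), since only GT and EQ exist
  // natively before AVX-512.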
  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                            : X86ISD::PCMPGT;
18564 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
18565 Cond == ISD::SETGE || Cond == ISD::SETUGE;
18566 bool Invert = Cond == ISD::SETNE ||
18567 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
  if (Swap)
    std::swap(Op0, Op1);
18572 // Check that the operation in question is available (most are plain SSE2,
18573 // but PCMPGTQ and PCMPEQQ have different requirements).
18574 if (VT == MVT::v2i64) {
18575 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
18576 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
18578 // First cast everything to the right type.
18579 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18580 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18582 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18583 // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
18589 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
18590 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
      }
18593 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
18594 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18596 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
18597 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18598 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18600 // Create masks for only the low parts/high parts of the 64 bit integers.
18601 static const int MaskHi[] = { 1, 1, 3, 3 };
18602 static const int MaskLo[] = { 0, 0, 2, 2 };
18603 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18604 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18605 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
18607 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
18608 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);
      return DAG.getBitcast(VT, Result);
    }
18616 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18617 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
18618 // pcmpeqd + pshufd + pand.
18619 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18621 // First cast everything to the right type.
18622 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18623 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18628 // Make sure the lower and upper halves are both all-ones.
18629 static const int Mask[] = { 1, 0, 3, 2 };
18630 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
18631 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);
      return DAG.getBitcast(VT, Result);
    }
  }
18640 // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
18643 MVT EltVT = VT.getVectorElementType();
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
                                 VT);
18646 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
  }
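  // Illustrative note: XOR with the sign mask turns an unsigned compare into
  // a signed one, e.g. (X u> Y) == ((X ^ 0x80000000) s> (Y ^ 0x80000000))
  // for i32 elements.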
18650 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}
18659 // Try to select this as a KTEST+SETCC if possible.
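// For example (illustrative): (i16 (bitcast (v16i1 X))) == 0 can be lowered
// to KORTESTW X, X plus SETE: KORTEST sets ZF when the OR of its operands is
// zero, and CF when it is all ones.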
18660 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18661 const SDLoc &dl, SelectionDAG &DAG,
18662 const X86Subtarget &Subtarget) {
18663 // Only support equality comparisons.
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return SDValue();
18667 // Must be a bitcast from vXi1.
  if (Op0.getOpcode() != ISD::BITCAST)
    return SDValue();
18671 Op0 = Op0.getOperand(0);
18672 MVT VT = Op0.getSimpleValueType();
18673 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
18674 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
      !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
    return SDValue();
18678 X86::CondCode X86CC;
18679 if (isNullConstant(Op1)) {
18680 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
18681 } else if (isAllOnesConstant(Op1)) {
18682 // C flag is set for all ones.
    X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
  } else
    return SDValue();
  // If the input is an OR, we can combine its operands into the KORTEST.
  SDValue LHS = Op0;
  SDValue RHS = Op0;
18690 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
18691 LHS = Op0.getOperand(0);
    RHS = Op0.getOperand(1);
  }
18695 SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
  return getSETCC(X86CC, KORTEST, dl, DAG);
}
18699 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18701 MVT VT = Op.getSimpleValueType();
18703 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18705 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18706 SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
18709 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18711 // Optimize to BT if possible.
18712 // Lower (X & (1 << N)) == 0 to BT(X, N).
18713 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18714 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
18715 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18716 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
      return NewSetCC;
  }
18721 // Try to use PTEST for a tree ORs equality compared with 0.
18722 // TODO: We could do AND tree with all 1s as well by using the C flag.
18723 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
18724 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
      return NewSetCC;
  }
18729 // Try to lower using KTEST.
  if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
    return NewSetCC;
  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
  // these.
18735 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18736 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18738 // If the input is a setcc, then reuse the input setcc or use a new one with
18739 // the inverted condition.
18740 if (Op0.getOpcode() == X86ISD::SETCC) {
18741 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
      if (!Invert)
        return Op0;
18746 CCode = X86::GetOppositeBranchCondition(CCode);
      return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
    }
  }
18751 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18752 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();
18756 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18757 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  return getSETCC(X86CC, EFLAGS, dl, DAG);
}
18761 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18762 SDValue LHS = Op.getOperand(0);
18763 SDValue RHS = Op.getOperand(1);
18764 SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);
18768 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18769 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18771 // Recreate the carry if needed.
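  // Illustrative note: adding all-ones (-1) to the carry value wraps exactly
  // when the value is nonzero, so the hardware carry flag becomes
  // (Carry != 0), which the SBB below then consumes.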
18772 EVT CarryVT = Carry.getValueType();
18773 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18774 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18775 Carry, DAG.getConstant(NegOne, DL, CarryVT));
18777 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18778 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
  return getSETCC(CC, Cmp.getValue(1), DL, DAG);
}
18782 /// Return true if opcode is a X86 logical comparison.
18783 static bool isX86LogicalCmp(SDValue Op) {
18784 unsigned Opc = Op.getOpcode();
18785 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
18788 if (Op.getResNo() == 1 &&
18789 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
18790 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
18791 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
    return true;
  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}
18801 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;
18805 SDValue VOp0 = V.getOperand(0);
18806 unsigned InBits = VOp0.getValueSizeInBits();
18807 unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}
18811 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
18812 bool AddTest = true;
18813 SDValue Cond = Op.getOperand(0);
18814 SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;
18820 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
18821 // are available or VBLENDV if AVX is available.
18822 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
18823 if (Cond.getOpcode() == ISD::SETCC &&
18824 ((Subtarget.hasSSE2() && VT == MVT::f64) ||
18825 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18826 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18827 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18828 unsigned SSECC = translateX86FSETCC(
18829 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18831 if (Subtarget.hasAVX512()) {
18832 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18833 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18834 assert(!VT.isVector() && "Not a scalar type?");
      return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
    }
18838 if (SSECC < 8 || Subtarget.hasAVX()) {
18839 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18840 DAG.getConstant(SSECC, DL, MVT::i8));
18842 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18843 // of 3 logic instructions for size savings and potentially speed.
18844 // Unfortunately, there is no scalar form of VBLENDV.
18846 // If either operand is a constant, don't try this. We can expect to
18847 // optimize away at least one of the logic instructions later in that
18848 // case, so that sequence would be faster than a variable blend.
18850 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18851 // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.
18855 if (Subtarget.hasAVX() &&
18856 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18858 // Convert to vectors, do a VSELECT, and convert back to scalar.
18859 // All of the conversions should be optimized away.
18861 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18862 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18863 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18864 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18866 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18867 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18869 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18871 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                             VSel, DAG.getIntPtrConstant(0, DL));
        }
18874 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18875 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }
18880 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18881 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18882 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }
18886 // For v64i1 without 64-bit support we need to split and rejoin.
18887 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
18888 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
18889 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
18890 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
18891 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
18892 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
18893 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
18894 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
  }
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
18900 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18901 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18902 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
      Op1Scalar = Op1.getOperand(0);
    SDValue Op2Scalar;
18905 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18906 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18907 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18908 Op2Scalar = Op2.getOperand(0);
18909 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18910 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18911 Op1Scalar, Op2Scalar);
18912 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18913 return DAG.getBitcast(VT, newSelect);
18914 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18915 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));
    }
  }
18920 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18921 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18922 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18923 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18924 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18925 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18926 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
  }
18930 if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
18933 // If the condition was updated, it's possible that the operands of the
18934 // select were also updated (for example, EmitTest has a RAUW). Refresh
18935 // the local references to the select operands in case they got stale.
18936 Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);
    }
  }
18941 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18942 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18943 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18944 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18945 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18946 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
18947 if (Cond.getOpcode() == X86ISD::SETCC &&
18948 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18949 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18950 SDValue Cmp = Cond.getOperand(1);
18951 unsigned CondCode =
18952 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18954 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18955 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18956 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18957 SDValue CmpOp0 = Cmp.getOperand(0);
18959 // Apply further optimizations for special cases
18960 // (select (x != 0), -1, 0) -> neg & sbb
18961 // (select (x == 0), 0, -1) -> neg & sbb
18962 if (isNullConstant(Y) &&
18963 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18964 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18965 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18966 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18967 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18968 DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }
18973 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18974 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18975 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18977 SDValue Res = // Res = 0 or -1.
18978 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18979 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18981 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18982 Res = DAG.getNOT(DL, Res, Res.getValueType());
18984 if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
18987 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18988 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18989 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18990 SDValue CmpOp0 = Cmp.getOperand(0);
18991 SDValue Src1, Src2;
      // True if Op2 is an XOR or OR operator and one of its operands
      // equals Op1:
      //   (a, a op b) || (b, a op b)
18995 auto isOrXorPattern = [&]() {
18996 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18997 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
          Src1 =
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
          Src2 = Op1;
          return true;
        }
        return false;
      };
      if (isOrXorPattern()) {
        SDValue Neg;
19008 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // We need a mask of all zeros or all ones, with the same size as the
        // other operands.
19011 if (CmpSz > VT.getSizeInBits())
19012 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
19013 else if (CmpSz < VT.getSizeInBits())
19014 Neg = DAG.getNode(ISD::AND, DL, VT,
19015 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
                            DAG.getConstant(1, DL, VT));
        else
          Neg = CmpOp0;
19019 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
19020 Neg); // -(and (x, 0x1))
19021 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
      }
    }
  }
19027 // Look past (and (setcc_carry (cmp ...)), 1).
19028 if (Cond.getOpcode() == ISD::AND &&
19029 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19030 isOneConstant(Cond.getOperand(1)))
19031 Cond = Cond.getOperand(0);
19033 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19034 // setting operand in place of the X86ISD::SETCC.
19035 unsigned CondOpcode = Cond.getOpcode();
19036 if (CondOpcode == X86ISD::SETCC ||
19037 CondOpcode == X86ISD::SETCC_CARRY) {
19038 CC = Cond.getOperand(0);
19040 SDValue Cmp = Cond.getOperand(1);
19041 unsigned Opc = Cmp.getOpcode();
19042 MVT VT = Op.getSimpleValueType();
19044 bool IllegalFPCMov = false;
19045 if (VT.isFloatingPoint() && !VT.isVector() &&
19046 !isScalarFPTypeInSSEReg(VT)) // FPStack?
19047 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
19049 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
19054 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19055 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19056 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19057 Cond.getOperand(0).getValueType() != MVT::i8)) {
19058 SDValue LHS = Cond.getOperand(0);
19059 SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
19063 switch (CondOpcode) {
19064 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19065 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19066 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19067 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19068 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19069 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
19072 if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
19076 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19078 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
19080 if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
19083 Cond = X86Op.getValue(1);
    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
19090 // Look past the truncate if the high bits are known zero.
19091 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19092 Cond = Cond.getOperand(0);
    // We know the result of AND is compared against zero. Try to match
    // it to BT.
19096 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19097 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
19098 CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        AddTest = false;
      }
    }
  }

  if (AddTest) {
19106 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }
19110 // a < b ? -1 : 0 -> RES = ~setcc_carry
19111 // a < b ? 0 : -1 -> RES = setcc_carry
19112 // a >= b ? -1 : 0 -> RES = setcc_carry
19113 // a >= b ? 0 : -1 -> RES = ~setcc_carry
19114 if (Cond.getOpcode() == X86ISD::SUB) {
19115 Cond = ConvertCmpIfNecessary(Cond, DAG);
19116 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
19118 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
19119 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
19120 (isNullConstant(Op1) || isNullConstant(Op2))) {
19121 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
19124 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }
  // X86 doesn't have an i8 cmov. If both operands are the result of a
  // truncate, widen the cmov and push the truncate through. This avoids
  // introducing a new branch during isel and doesn't add any extensions.
19133 if (Op.getValueType() == MVT::i8 &&
19134 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
19135 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
19136 if (T1.getValueType() == T2.getValueType() &&
19137 // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
                                 CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }
19145 // Promote i16 cmovs if it won't prevent folding a load.
19146 if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
19147 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
19148 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
19149 SDValue Ops[] = { Op2, Op1, CC, Cond };
19150 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
  }
19154 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
19155 // condition is true.
19156 SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}
19160 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
19161 const X86Subtarget &Subtarget,
19162 SelectionDAG &DAG) {
19163 MVT VT = Op->getSimpleValueType(0);
19164 SDValue In = Op->getOperand(0);
19165 MVT InVT = In.getSimpleValueType();
19166 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  MVT VTElt = VT.getVectorElementType();
  SDLoc dl(Op);
19170 unsigned NumElts = VT.getVectorNumElements();
  // Extend VT if the scalar type is i8/i16 and BWI is not supported.
  MVT ExtVT = VT;
  if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
19175 // If v16i32 is to be avoided, we'll need to split and concatenate.
19176 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19177 return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
  }
19182 // Widen to 512-bits if VLX is not supported.
19183 MVT WideVT = ExtVT;
19184 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19185 NumElts *= 512 / ExtVT.getSizeInBits();
19186 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19187 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
19188 In, DAG.getIntPtrConstant(0, dl));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
  }

  SDValue V;
19193 MVT WideEltVT = WideVT.getVectorElementType();
19194 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
19195 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
    V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
  } else {
19198 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
19199 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
    V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
  }
  // Truncate if we had to extend i16/i8 above.
  if (VT != ExtVT) {
19205 WideVT = MVT::getVectorVT(VTElt, NumElts);
    V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
  }
  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
19211 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
                    DAG.getIntPtrConstant(0, dl));

  return V;
}
19217 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19218 SelectionDAG &DAG) {
19219 SDValue In = Op->getOperand(0);
19220 MVT InVT = In.getSimpleValueType();
19222 if (InVT.getVectorElementType() == MVT::i1)
19223 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19225 assert(Subtarget.hasAVX() && "Expected AVX support");
  return LowerAVXExtend(Op, DAG, Subtarget);
}
19229 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
19230 // For sign extend this needs to handle all vector sizes and SSE4.1 and
19231 // non-SSE4.1 targets. For zero extend this should only handle inputs of
19232 // MVT::v64i8 when BWI is not supported, but AVX512 is.
19233 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
19234 const X86Subtarget &Subtarget,
19235 SelectionDAG &DAG) {
19236 SDValue In = Op->getOperand(0);
19237 MVT VT = Op->getSimpleValueType(0);
19238 MVT InVT = In.getSimpleValueType();
19239 assert(VT.getSizeInBits() == InVT.getSizeInBits());
19241 MVT SVT = VT.getVectorElementType();
19242 MVT InSVT = InVT.getVectorElementType();
19243 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();
19249 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
19250 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
      !(VT.is512BitVector() && Subtarget.hasAVX512()))
    return SDValue();

  SDLoc dl(Op);
19256 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
19257 // For 512-bit vectors, we need 128-bits or 256-bits.
19258 if (VT.getSizeInBits() > 128) {
19259 // Input needs to be at least the same number of elements as output, and
19260 // at least 128-bits.
19261 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
    InVT = In.getSimpleValueType();
  }
19265 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
19266 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
  // SSE41 targets can use the pmovsx* instructions directly for 128-bit
  // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
  // instructions still need to be handled here for 256/512-bit results.
19271 if (Subtarget.hasInt256()) {
19272 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
19273 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
19274 X86ISD::VSEXT : X86ISD::VZEXT;
    return DAG.getNode(ExtOpc, dl, VT, In);
  }
19278 // We should only get here for sign extend.
19279 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
19280 "Unexpected opcode!");
  // Pre-SSE41 targets unpack the lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  MVT CurrVT = InVT;
19286 // As SRAI is only available on i16/i32 types, we expand only up to i32
19287 // and handle i64 separately.
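  // Illustrative example: for a v16i8 source, one UNPCKL step places each low
  // byte in the high byte of an i16 lane; a following VSRAI by 8 then
  // replicates the sign bit, completing the i8 -> i16 sign extension.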
19288 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
19289 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
19290 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
19291 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
    Curr = DAG.getBitcast(CurrVT, Curr);
  }
19295 SDValue SignExt = Curr;
19296 if (CurrVT != InVT) {
19297 unsigned SignExtShift =
19298 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
19299 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                          DAG.getConstant(SignExtShift, dl, MVT::i8));
  }
19306 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
19307 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19308 DAG.getConstant(31, dl, MVT::i8));
19309 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
    return DAG.getBitcast(VT, Ext);
  }

  return SignExt;
}
19316 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19317 SelectionDAG &DAG) {
19318 MVT VT = Op->getSimpleValueType(0);
19319 SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);
19323 if (InVT.getVectorElementType() == MVT::i1)
19324 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19326 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19328 "Expected same number of elements");
19329 assert((VT.getVectorElementType() == MVT::i16 ||
19330 VT.getVectorElementType() == MVT::i32 ||
19331 VT.getVectorElementType() == MVT::i64) &&
19332 "Unexpected element type");
19333 assert((InVT.getVectorElementType() == MVT::i8 ||
19334 InVT.getVectorElementType() == MVT::i16 ||
19335 InVT.getVectorElementType() == MVT::i32) &&
19336 "Unexpected element type");
19338 if (Subtarget.hasInt256())
19339 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
19341 // Optimize vectors in AVX mode
  // Sign extend v8i16 to v8i32 and
  // v4i32 to v4i64.
  //
19345 // Divide input vector into two parts
19346 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
19347 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
19348 // concat the vectors to original VT
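  // For example (illustrative), for v8i16 -> v8i32 on AVX1: the low shuffle
  // keeps e0..e3, the high shuffle moves e4..e7 down, each half is extended
  // with vpmovsxwd to v4i32, and the two halves are concatenated into v8i32.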
19350 unsigned NumElems = InVT.getVectorNumElements();
19351 SDValue Undef = DAG.getUNDEF(InVT);
19353 SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;
19357 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
19359 SmallVector<int,8> ShufMask2(NumElems, -1);
19360 for (unsigned i = 0; i != NumElems/2; ++i)
19361 ShufMask2[i] = i + NumElems/2;
19363 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
19365 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
19366 VT.getVectorNumElements() / 2);
19368 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
19369 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
19374 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
19375 SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
  SDLoc dl(St);
  SDValue StoredVal = St->getValue();
19380 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
19381 assert(StoredVal.getValueType().isVector() &&
19382 StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
         StoredVal.getValueType().getVectorNumElements() <= 8 &&
         "Unexpected VT");
19385 assert(!St->isTruncatingStore() && "Expected non-truncating store");
19386 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19387 "Expected AVX512F without AVX512DQI");
19389 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
19390 DAG.getUNDEF(MVT::v8i1), StoredVal,
19391 DAG.getIntPtrConstant(0, dl));
19392 StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
19394 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
19395 St->getPointerInfo(), St->getAlignment(),
                      St->getMemOperand()->getFlags());
}
19399 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
19400 // may emit an illegal shuffle but the expansion is still better than scalar
19401 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
19403 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
19404 // TODO: It is possible to support ZExt by zeroing the undef values during
19405 // the shuffle phase or after the shuffle.
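// For example (illustrative): a sextload of v4i8 into v4i32 loads the four
// bytes with a single scalar load, shuffles them into separate i32 lanes,
// and then sign extends with SIGN_EXTEND_VECTOR_INREG or X86ISD::VSEXT.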
19406 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
19407 SelectionDAG &DAG) {
19408 MVT RegVT = Op.getSimpleValueType();
19409 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
19410 assert(RegVT.isInteger() &&
19411 "We only custom lower integer vector sext loads.");
  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
19415 EVT MemVT = Ld->getMemoryVT();
19417 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
19418 if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
19419 assert(EVT(RegVT) == MemVT && "Expected non-extending load");
19420 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
19421 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19422 "Expected AVX512F without AVX512DQI");
19424 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
19425 Ld->getPointerInfo(), Ld->getAlignment(),
19426 Ld->getMemOperand()->getFlags());
19428 // Replace chain users with the new chain.
19429 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
19430 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
19432 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
19433 DAG.getBitcast(MVT::v8i1, NewLd),
19434 DAG.getIntPtrConstant(0, dl));
    return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
  }
19438 // Nothing useful we can do without SSE2 shuffles.
19439 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
19441 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19442 unsigned RegSz = RegVT.getSizeInBits();
19444 ISD::LoadExtType Ext = Ld->getExtensionType();
19446 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
19447 && "Only anyext and sext are currently implemented.");
19448 assert(MemVT != RegVT && "Cannot extend to the same type");
19449 assert(MemVT.isVector() && "Must load a vector from memory");
19451 unsigned NumElems = RegVT.getVectorNumElements();
19452 unsigned MemSz = MemVT.getSizeInBits();
19453 assert(RegSz > MemSz && "Register size must be greater than the mem size");
19455 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
19456 // The only way in which we have a legal 256-bit vector result but not the
19457 // integer 256-bit operations needed to directly lower a sextload is if we
19458 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
19459 // a 128-bit vector and a normal sign_extend to 256-bits that should get
19460 // correctly legalized. We do this late to allow the canonical form of
19461 // sextload to persist throughout the rest of the DAG combiner -- it wants
19462 // to fold together any extensions it can, and so will fuse a sign_extend
19463 // of an sextload into a sextload targeting a wider value.
    SDValue Load;
    if (MemSz == 128) {
19466 // Just switch this to a normal load.
19467 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
19468 "it must be a legal 128-bit vector "
19470 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
19471 Ld->getPointerInfo(), Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    } else {
19474 assert(MemSz < 128 &&
19475 "Can't extend a type wider than 128 bits to a 256 bit vector!");
19476 // Do an sext load to a 128-bit vector type. We want to use the same
19477 // number of elements, but elements half as wide. This will end up being
19478 // recursively lowered by this routine, but will succeed as we definitely
      // have all the necessary features if we're using AVX1.
      EVT HalfEltVT =
19481 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
19482 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
      Load =
          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
19485 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    }
19489 // Replace chain users with the new chain.
19490 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
19491 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
19493 // Finally, do a normal sign-extend to the desired register.
    return DAG.getSExtOrTrunc(Load, dl, RegVT);
  }
19497 // All sizes must be a power of two.
19498 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
19499 "Non-power-of-two elements are not custom lowered!");
19501 // Attempt to load the original value using scalar loads.
19502 // Find the largest scalar type that divides the total loaded size.
19503 MVT SclrLoadTy = MVT::i8;
19504 for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
      SclrLoadTy = Tp;
    }
  }
19510 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
      (64 <= MemSz))
19513 SclrLoadTy = MVT::f64;
19515 // Calculate the number of scalar loads that we need to perform
19516 // in order to load our vector from memory.
19517 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
19519 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
19520 "Can only lower sext loads with a single scalar load!");
  unsigned loadRegSize = RegSz;
  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
    loadRegSize = 128;
  // If we don't have BWI we won't be able to create the shuffle needed for
  // v8i64.
  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
      MemVT == MVT::v8i8)
    loadRegSize = 128;
19532 // Represent our vector as a sequence of elements which are the
19533 // largest scalar that we can load.
19534 EVT LoadUnitVecVT = EVT::getVectorVT(
      *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
19537 // Represent the data using the same element type that is stored in
  // memory. In practice, we "widen" MemVT.
  EVT WideVecVT =
19540 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       loadRegSize / MemVT.getScalarSizeInBits());
19543 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19544 "Invalid vector type");
19546 // We can't shuffle using an illegal type.
19547 assert(TLI.isTypeLegal(WideVecVT) &&
19548 "We only lower types that form legal widened vector types");
19550 SmallVector<SDValue, 8> Chains;
19551 SDValue Ptr = Ld->getBasePtr();
19552 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19553 TLI.getPointerTy(DAG.getDataLayout()));
19554 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
19556 for (unsigned i = 0; i < NumLoads; ++i) {
19557 // Perform a single load.
19558 SDValue ScalarLoad =
19559 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19560 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19561 Chains.push_back(ScalarLoad.getValue(1));
19562 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
    // another round of DAGCombining.
    if (i == 0)
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
    else
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
19568 ScalarLoad, DAG.getIntPtrConstant(i, dl));
    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
  }
19573 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19575 // Bitcast the loaded value to a vector of the original element type, in
19576 // the size of the target vector type.
19577 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19578 unsigned SizeRatio = RegSz / MemSz;
19580 if (Ext == ISD::SEXTLOAD) {
19581 // If we have SSE4.1, we can directly emit a VSEXT node.
19582 if (Subtarget.hasSSE41()) {
19583 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
      return Sext;
    }
    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
    // lanes.
19590 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19591 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19593 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Shuff;
  }
19598 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19599 MemVT == MVT::v8i8) {
19600 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Sext;
  }
19605 // Redistribute the loaded elements into the different locations.
19606 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19607 for (unsigned i = 0; i != NumElems; ++i)
19608 ShuffleVec[i * SizeRatio] = i;
19610 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19611 DAG.getUNDEF(WideVecVT), ShuffleVec);
19613 // Bitcast to the requested type.
19614 Shuff = DAG.getBitcast(RegVT, Shuff);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
  return Shuff;
}
19619 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
19620 /// each of which has no other use apart from the AND / OR.
19621 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
19622 Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
19625 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19626 Op.getOperand(0).hasOneUse() &&
19627 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}
/// Return true if node is an ISD::XOR of an X86ISD::SETCC and 1, and that the
/// SETCC node has a single use.
19633 static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
19636 if (isOneConstant(Op.getOperand(1)))
19637 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  return false;
}
19642 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
19643 bool addTest = true;
19644 SDValue Chain = Op.getOperand(0);
19645 SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
19649 bool Inverted = false;
19651 if (Cond.getOpcode() == ISD::SETCC) {
19652 // Check for setcc([su]{add,sub,mul}o == 0).
19653 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
19654 isNullConstant(Cond.getOperand(1)) &&
19655 Cond.getOperand(0).getResNo() == 1 &&
19656 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
19657 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
19658 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
19659 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
19660 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      if (SDValue NewCond = LowerSETCC(Cond, DAG))
        Cond = NewCond;
    }
  }
#if 0
19670 // FIXME: LowerXALUO doesn't handle these!!
19671 else if (Cond.getOpcode() == X86ISD::ADD ||
19672 Cond.getOpcode() == X86ISD::SUB ||
19673 Cond.getOpcode() == X86ISD::SMUL ||
19674 Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif
  // Look past (and (setcc_carry (cmp ...)), 1).
19679 if (Cond.getOpcode() == ISD::AND &&
19680 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19681 isOneConstant(Cond.getOperand(1)))
19682 Cond = Cond.getOperand(0);
19684 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19685 // setting operand in place of the X86ISD::SETCC.
19686 unsigned CondOpcode = Cond.getOpcode();
19687 if (CondOpcode == X86ISD::SETCC ||
19688 CondOpcode == X86ISD::SETCC_CARRY) {
19689 CC = Cond.getOperand(0);
19691 SDValue Cmp = Cond.getOperand(1);
19692 unsigned Opc = Cmp.getOpcode();
19693 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
19702 // These can only come from an arithmetic instruction with overflow,
19703 // e.g. SADDO, UADDO.
        Cond = Cond.getOperand(1);
        addTest = false;
        break;
      }
    }
  }
19710 CondOpcode = Cond.getOpcode();
19711 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19712 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19713 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19714 Cond.getOperand(0).getValueType() != MVT::i8)) {
19715 SDValue LHS = Cond.getOperand(0);
19716 SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
19720 // Keep this in sync with LowerXALUO, otherwise we might create redundant
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
    // X86ISD::INC).
19723 switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19738 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19739 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (Inverted)
19743 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19744 if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
19748 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19750 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19752 if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
19755 Cond = X86Op.getValue(1);
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    addTest = false;
  } else {
    unsigned CondOpc;
19761 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19762 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19763 if (CondOpc == ISD::OR) {
19764 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
19767 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19768 isX86LogicalCmp(Cmp)) {
19769 CC = Cond.getOperand(0).getOperand(0);
19770 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19771 Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
19776 } else { // ISD::AND
19777 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19778 // two branches instead of an explicit AND instruction with a
19779 // separate test. However, we only do this if this block doesn't
19780 // have a fall-through edge, because this requires an explicit
19781 // jmp when the condition is false.
19782 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19783 isX86LogicalCmp(Cmp) &&
19784 Op.getNode()->hasOneUse()) {
19785 X86::CondCode CCode =
19786 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19787 CCode = X86::GetOppositeBranchCondition(CCode);
19788 CC = DAG.getConstant(CCode, dl, MVT::i8);
19789 SDNode *User = *Op.getNode()->use_begin();
19790 // Look for an unconditional branch following this conditional branch.
19791 // We need this because we need to reverse the successors in order
19792 // to implement FCMP_OEQ.
19793 if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;
          Dest = FalseBB;
19801 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19802 Chain, Dest, CC, Cmp);
19803 X86::CondCode CCode =
19804 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19805 CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, dl, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
19812 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19813 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
    // It should be transformed during dag combiner except when the condition
    // is set by an arithmetic-with-overflow node.
19816 X86::CondCode CCode =
19817 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19818 CCode = X86::GetOppositeBranchCondition(CCode);
19819 CC = DAG.getConstant(CCode, dl, MVT::i8);
    Cond = Cond.getOperand(0).getOperand(1);
    addTest = false;
19822 } else if (Cond.getOpcode() == ISD::SETCC &&
19823 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19824 // For FCMP_OEQ, we can emit
19825 // two branches instead of an explicit AND instruction with a
19826 // separate test. However, we only do this if this block doesn't
19827 // have a fall-through edge, because this requires an explicit
19828 // jmp when the condition is false.
19829 if (Op.getNode()->hasOneUse()) {
19830 SDNode *User = *Op.getNode()->use_begin();
19831 // Look for an unconditional branch following this conditional branch.
19832 // We need this because we need to reverse the successors in order
19833 // to implement FCMP_OEQ.
19834 if (User->getOpcode() == ISD::BR) {
        SDValue FalseBB = User->getOperand(1);
        SDNode *NewBR =
          DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
        assert(NewBR == User);
        (void)NewBR;
        Dest = FalseBB;
19842 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19843 Cond.getOperand(0), Cond.getOperand(1));
19844 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19845 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19846 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19847 Chain, Dest, CC, Cmp);
        CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
        Cond = Cmp;
        addTest = false;
      }
    }
19853 } else if (Cond.getOpcode() == ISD::SETCC &&
19854 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19855 // For FCMP_UNE, we can emit
19856 // two branches instead of an explicit AND instruction with a
19857 // separate test. However, we only do this if this block doesn't
19858 // have a fall-through edge, because this requires an explicit
19859 // jmp when the condition is false.
19860 if (Op.getNode()->hasOneUse()) {
19861 SDNode *User = *Op.getNode()->use_begin();
19862 // Look for an unconditional branch following this conditional branch.
19863 // We need this because we need to reverse the successors in order
19864 // to implement FCMP_UNE.
19865 if (User->getOpcode() == ISD::BR) {
        SDValue FalseBB = User->getOperand(1);
        SDNode *NewBR =
          DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
        assert(NewBR == User);
        (void)NewBR;
        Dest = FalseBB;
19872 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19873 Cond.getOperand(0), Cond.getOperand(1));
19874 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19875 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19876 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19877 Chain, Dest, CC, Cmp);
        CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
        Cond = Cmp;
        addTest = false;
      }
    }
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
19889 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19890 Cond = Cond.getOperand(0);
    // We know the result of AND is compared against zero. Try to match
    // it to BT.
19894 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19895 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19896 CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
19904 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19905 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    Cond = EmitTest(Cond, X86Cond, dl, DAG);
  }
19908 Cond = ConvertCmpIfNecessary(Cond, DAG);
19909 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}
19913 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19914 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19915 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19916 // that the guard pages used by the OS virtual memory manager are allocated in
19917 // correct sequence.
19919 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19920 SelectionDAG &DAG) const {
19921 MachineFunction &MF = DAG.getMachineFunction();
19922 bool SplitStack = MF.shouldSplitStack();
19923 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19924 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack || EmitStackProbe;
  SDLoc dl(Op);

  // Get the inputs.
19929 SDNode *Node = Op.getNode();
19930 SDValue Chain = Op.getOperand(0);
19931 SDValue Size = Op.getOperand(1);
19932 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19933 EVT VT = Node->getValueType(0);
19935 // Chain the dynamic stack allocation so that it doesn't modify the stack
19936 // pointer when other instructions are using the stack.
19937 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19939 bool Is64Bit = Subtarget.is64Bit();
19940 MVT SPTy = getPointerTy(DAG.getDataLayout());
19942 SDValue Result;
19943 if (!Lower) {
19944 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19945 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19946 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19947 " not tell us which reg is the stack pointer!");
19949 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19950 Chain = SP.getValue(1);
19951 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19952 unsigned StackAlign = TFI.getStackAlignment();
19953 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19954 if (Align > StackAlign)
19955 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19956 DAG.getConstant(-(uint64_t)Align, dl, VT));
19957 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19958 } else if (SplitStack) {
19959 MachineRegisterInfo &MRI = MF.getRegInfo();
19961 if (Is64Bit) {
19962 // The 64 bit implementation of segmented stacks needs to clobber both r10
19963 // and r11. This makes it impossible to use it along with nested parameters.
19964 const Function &F = MF.getFunction();
19965 for (const auto &A : F.args()) {
19966 if (A.hasNestAttr())
19967 report_fatal_error("Cannot use segmented stacks with functions that "
19968 "have nested arguments.");
19972 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19973 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19974 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19975 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19976 DAG.getRegister(Vreg, SPTy));
19977 } else {
19978 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19979 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19980 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19982 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19983 unsigned SPReg = RegInfo->getStackRegister();
19984 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19985 Chain = SP.getValue(1);
19987 if (Align) {
19988 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19989 DAG.getConstant(-(uint64_t)Align, dl, VT));
19990 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19991 }
19993 Result = SP;
19994 }
19996 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19997 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19999 SDValue Ops[2] = {Result, Chain};
20000 return DAG.getMergeValues(Ops, dl);
20001 }
20003 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
20004 MachineFunction &MF = DAG.getMachineFunction();
20005 auto PtrVT = getPointerTy(MF.getDataLayout());
20006 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20008 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
20009 SDLoc DL(Op);
20011 if (!Subtarget.is64Bit() ||
20012 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
20013 // vastart just stores the address of the VarArgsFrameIndex slot into the
20014 // memory location argument.
20015 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
20016 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
20017 MachinePointerInfo(SV));
20018 }
20020 // __va_list_tag:
20021 // gp_offset (0 - 6 * 8)
20022 // fp_offset (48 - 48 + 8 * 16)
20023 // overflow_arg_area (point to parameters coming in memory).
20024 // reg_save_area
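// Added note (not from the original source): this matches the SysV x86-64
// va_list layout, roughly
//   struct __va_list_tag {
//     unsigned gp_offset;        // bytes into reg_save_area for next GPR arg
//     unsigned fp_offset;        // bytes into reg_save_area for next XMM arg
//     void *overflow_arg_area;   // next stack-passed argument
//     void *reg_save_area;       // spilled register arguments
//   };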
20025 SmallVector<SDValue, 8> MemOps;
20026 SDValue FIN = Op.getOperand(1);
20027 // Store gp_offset
20028 SDValue Store = DAG.getStore(
20029 Op.getOperand(0), DL,
20030 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
20031 MachinePointerInfo(SV));
20032 MemOps.push_back(Store);
20034 // Store fp_offset
20035 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
20036 Store = DAG.getStore(
20037 Op.getOperand(0), DL,
20038 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
20039 MachinePointerInfo(SV, 4));
20040 MemOps.push_back(Store);
20042 // Store ptr to overflow_arg_area
20043 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
20044 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
20045 Store =
20046 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
20047 MemOps.push_back(Store);
20049 // Store ptr to reg_save_area.
20050 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
20051 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
20052 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
20053 Store = DAG.getStore(
20054 Op.getOperand(0), DL, RSFIN, FIN,
20055 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
20056 MemOps.push_back(Store);
20057 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
20058 }
20060 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
20061 assert(Subtarget.is64Bit() &&
20062 "LowerVAARG only handles 64-bit va_arg!");
20063 assert(Op.getNumOperands() == 4);
20065 MachineFunction &MF = DAG.getMachineFunction();
20066 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
20067 // The Win64 ABI uses char* instead of a structure.
20068 return DAG.expandVAArg(Op.getNode());
20070 SDValue Chain = Op.getOperand(0);
20071 SDValue SrcPtr = Op.getOperand(1);
20072 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
20073 unsigned Align = Op.getConstantOperandVal(3);
20074 SDLoc dl(Op);
20076 EVT ArgVT = Op.getNode()->getValueType(0);
20077 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
20078 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
20079 uint8_t ArgMode;
20081 // Decide which area this value should be read from.
20082 // TODO: Implement the AMD64 ABI in its entirety. This simple
20083 // selection mechanism works only for the basic types.
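// Illustrative examples (added note, not from the original source):
//   i32, i64, i128   -> ArgMode 1 (read via gp_offset from the GPR save area)
//   float, double    -> ArgMode 2 (read via fp_offset from the XMM save area)
//   f80 (long double) is always stack-passed and is not handled here yet.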
20084 if (ArgVT == MVT::f80) {
20085 llvm_unreachable("va_arg for f80 not yet implemented");
20086 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
20087 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
20088 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
20089 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
20090 } else {
20091 llvm_unreachable("Unhandled argument type in LowerVAARG");
20092 }
20094 if (ArgMode == 2) {
20095 // Sanity Check: Make sure using fp_offset makes sense.
20096 assert(!Subtarget.useSoftFloat() &&
20097 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
20098 Subtarget.hasSSE1());
20099 }
20101 // Insert VAARG_64 node into the DAG
20102 // VAARG_64 returns two values: Variable Argument Address, Chain
20103 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
20104 DAG.getConstant(ArgMode, dl, MVT::i8),
20105 DAG.getConstant(Align, dl, MVT::i32)};
20106 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
20107 SDValue VAARG = DAG.getMemIntrinsicNode(
20108 X86ISD::VAARG_64, dl,
20109 VTs, InstOps, MVT::i64,
20110 MachinePointerInfo(SV),
20111 /*Align=*/0,
20112 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
20113 Chain = VAARG.getValue(1);
20115 // Load the next argument and return it
20116 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
20117 }
20119 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
20120 SelectionDAG &DAG) {
20121 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
20122 // where a va_list is still an i8*.
20123 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
20124 if (Subtarget.isCallingConvWin64(
20125 DAG.getMachineFunction().getFunction().getCallingConv()))
20126 // Probably a Win64 va_copy.
20127 return DAG.expandVACopy(Op.getNode());
20129 SDValue Chain = Op.getOperand(0);
20130 SDValue DstPtr = Op.getOperand(1);
20131 SDValue SrcPtr = Op.getOperand(2);
20132 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
20133 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20134 SDLoc DL(Op);
20136 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
20137 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
20138 false, false,
20139 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
20140 }
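// Added note (not from the original source): the 24-byte size covers the whole
// __va_list_tag shown earlier: 4 (gp_offset) + 4 (fp_offset) +
// 8 (overflow_arg_area) + 8 (reg_save_area), so va_copy is a plain memcpy of
// the struct.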
20142 /// Handle vector element shifts where the shift amount is a constant.
20143 /// Takes immediate version of shift as input.
20144 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
20145 SDValue SrcOp, uint64_t ShiftAmt,
20146 SelectionDAG &DAG) {
20147 MVT ElementType = VT.getVectorElementType();
20149 // Bitcast the source vector to the output type, this is mainly necessary for
20150 // vXi8/vXi64 shifts.
20151 if (VT != SrcOp.getSimpleValueType())
20152 SrcOp = DAG.getBitcast(VT, SrcOp);
20154 // Fold this packed shift into its first operand if ShiftAmt is 0.
20155 if (ShiftAmt == 0)
20156 return SrcOp;
20158 // Check for ShiftAmt >= element width
20159 if (ShiftAmt >= ElementType.getSizeInBits()) {
20160 if (Opc == X86ISD::VSRAI)
20161 ShiftAmt = ElementType.getSizeInBits() - 1;
20162 else
20163 return DAG.getConstant(0, dl, VT);
20164 }
20166 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
20167 && "Unknown target vector shift-by-constant node");
20169 // Fold this packed vector shift into a build vector if SrcOp is a
20170 // vector of Constants or UNDEFs.
20171 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
20172 SmallVector<SDValue, 8> Elts;
20173 unsigned NumElts = SrcOp->getNumOperands();
20174 ConstantSDNode *ND;
20176 switch(Opc) {
20177 default: llvm_unreachable("Unknown opcode!");
20178 case X86ISD::VSHLI:
20179 for (unsigned i=0; i!=NumElts; ++i) {
20180 SDValue CurrentOp = SrcOp->getOperand(i);
20181 if (CurrentOp->isUndef()) {
20182 Elts.push_back(CurrentOp);
20183 continue;
20184 }
20185 ND = cast<ConstantSDNode>(CurrentOp);
20186 const APInt &C = ND->getAPIntValue();
20187 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
20188 }
20189 break;
20190 case X86ISD::VSRLI:
20191 for (unsigned i=0; i!=NumElts; ++i) {
20192 SDValue CurrentOp = SrcOp->getOperand(i);
20193 if (CurrentOp->isUndef()) {
20194 Elts.push_back(CurrentOp);
20195 continue;
20196 }
20197 ND = cast<ConstantSDNode>(CurrentOp);
20198 const APInt &C = ND->getAPIntValue();
20199 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
20200 }
20201 break;
20202 case X86ISD::VSRAI:
20203 for (unsigned i=0; i!=NumElts; ++i) {
20204 SDValue CurrentOp = SrcOp->getOperand(i);
20205 if (CurrentOp->isUndef()) {
20206 Elts.push_back(CurrentOp);
20207 continue;
20208 }
20209 ND = cast<ConstantSDNode>(CurrentOp);
20210 const APInt &C = ND->getAPIntValue();
20211 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
20212 }
20213 break;
20214 }
20216 return DAG.getBuildVector(VT, dl, Elts);
20217 }
20219 return DAG.getNode(Opc, dl, VT, SrcOp,
20220 DAG.getConstant(ShiftAmt, dl, MVT::i8));
20221 }
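// Illustrative sketch (added note, not from the original source): with
//   SrcOp = build_vector <i32 1, i32 2, i32 undef, i32 8>
// getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v4i32, SrcOp, 4, DAG)
// constant-folds to build_vector <i32 16, i32 32, i32 undef, i32 128>
// instead of emitting a PSLLD.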
20223 /// Handle vector element shifts where the shift amount may or may not be a
20224 /// constant. Takes immediate version of shift as input.
20225 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
20226 SDValue SrcOp, SDValue ShAmt,
20227 const X86Subtarget &Subtarget,
20228 SelectionDAG &DAG) {
20229 MVT SVT = ShAmt.getSimpleValueType();
20230 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
20232 // Catch shift-by-constant.
20233 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
20234 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
20235 CShAmt->getZExtValue(), DAG);
20237 // Change opcode to non-immediate version.
20238 switch (Opc) {
20239 default: llvm_unreachable("Unknown target vector shift node");
20240 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
20241 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
20242 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
20243 }
20245 // Need to build a vector containing shift amount.
20246 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
20247 // +=================+============+=======================================+
20248 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
20249 // +=================+============+=======================================+
20250 // | i64 | Yes, No | Use ShAmt as lowest elt |
20251 // | i32 | Yes | zero-extend in-reg |
20252 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
20253 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
20254 // +=================+============+=======================================+
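// Illustrative sketch (added note, not from the original source): for an i32
// shift amount on a pre-SSE4.1 target the fallback row above builds
//   ShAmt = build_vector <ShAmt, 0, undef, undef> : v4i32
// so the low 64 bits of the count vector - the only bits the VSHL/VSRL/VSRA
// nodes read - are fully defined.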
20256 if (SVT == MVT::i64)
20257 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
20258 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
20259 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
20260 ShAmt = ShAmt.getOperand(0);
20261 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
20262 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20263 } else if (Subtarget.hasSSE41() &&
20264 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
20265 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
20266 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20267 } else {
20268 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
20269 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
20270 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
20271 }
20273 // The return type has to be a 128-bit type with the same element
20274 // type as the input type.
20275 MVT EltVT = VT.getVectorElementType();
20276 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
20278 ShAmt = DAG.getBitcast(ShVT, ShAmt);
20279 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
20280 }
20282 /// Return Mask with the necessary casting or extending
20283 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
20284 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
20285 const X86Subtarget &Subtarget, SelectionDAG &DAG,
20286 const SDLoc &dl) {
20288 if (isAllOnesConstant(Mask))
20289 return DAG.getConstant(1, dl, MaskVT);
20290 if (X86::isZeroNode(Mask))
20291 return DAG.getConstant(0, dl, MaskVT);
20293 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
20294 // Mask should be extended
20295 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
20296 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
20297 }
20299 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
20300 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
20301 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
20302 // In 32-bit mode a bitcast of i64 is illegal; extend/split it.
20303 SDValue Lo, Hi;
20304 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20305 DAG.getConstant(0, dl, MVT::i32));
20306 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20307 DAG.getConstant(1, dl, MVT::i32));
20309 Lo = DAG.getBitcast(MVT::v32i1, Lo);
20310 Hi = DAG.getBitcast(MVT::v32i1, Hi);
20312 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
20313 } else {
20314 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20315 Mask.getSimpleValueType().getSizeInBits());
20316 // In the case when MaskVT equals v2i1 or v4i1, the lower 2 or 4 elements
20317 // are extracted by EXTRACT_SUBVECTOR.
20318 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
20319 DAG.getBitcast(BitcastVT, Mask),
20320 DAG.getIntPtrConstant(0, dl));
20321 }
20322 }
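// Illustrative sketch (added note, not from the original source): an i8 mask
// lowered for a v4i1 MaskVT takes the EXTRACT_SUBVECTOR path above:
//   (v4i1 extract_subvector (v8i1 bitcast %mask), 0)
// i.e. only the low 4 mask bits survive.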
20324 /// Return (and \p Op, \p Mask) for compare instructions or
20325 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
20326 /// necessary casting or extending for \p Mask when lowering masking intrinsics
20327 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
20328 SDValue PreservedSrc,
20329 const X86Subtarget &Subtarget,
20330 SelectionDAG &DAG) {
20331 MVT VT = Op.getSimpleValueType();
20332 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20333 unsigned OpcodeSelect = ISD::VSELECT;
20334 SDLoc dl(Op);
20336 if (isAllOnesConstant(Mask))
20337 return Op;
20339 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20341 switch (Op.getOpcode()) {
20342 default: break;
20343 case X86ISD::CMPM:
20344 case X86ISD::CMPM_RND:
20345 case X86ISD::VPSHUFBITQMB:
20346 case X86ISD::VFPCLASS:
20347 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
20348 case ISD::TRUNCATE:
20349 case X86ISD::VTRUNC:
20350 case X86ISD::VTRUNCS:
20351 case X86ISD::VTRUNCUS:
20352 case X86ISD::CVTPS2PH:
20353 // We can't use ISD::VSELECT here because it is not always "Legal"
20354 // for the destination type. For example vpmovqb requires only AVX512,
20355 // while a vselect that operates on byte elements requires AVX512BW.
20356 OpcodeSelect = X86ISD::SELECT;
20357 break;
20358 }
20359 if (PreservedSrc.isUndef())
20360 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20361 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
20362 }
20364 /// Creates an SDNode for a predicated scalar operation.
20365 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
20366 /// The mask comes in as MVT::i8 and should be transformed
20367 /// to MVT::v1i1 while lowering masking intrinsics.
20368 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
20369 /// "X86select" instead of "vselect". We just can't create the "vselect" node
20370 /// for a scalar instruction.
20371 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
20372 SDValue PreservedSrc,
20373 const X86Subtarget &Subtarget,
20374 SelectionDAG &DAG) {
20376 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
20377 if (MaskConst->getZExtValue() & 0x1)
20378 return Op;
20380 MVT VT = Op.getSimpleValueType();
20381 SDLoc dl(Op);
20383 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
20384 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
20385 if (Op.getOpcode() == X86ISD::FSETCCM ||
20386 Op.getOpcode() == X86ISD::FSETCCM_RND ||
20387 Op.getOpcode() == X86ISD::VFPCLASSS)
20388 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
20390 if (PreservedSrc.isUndef())
20391 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20392 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
20393 }
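// Illustrative sketch (added note, not from the original source): for a masked
// scalar operation such as a hypothetical (mask_add_ss %a, %b, %passthru, i8 %k),
// only bit 0 of %k is meaningful; the mask becomes
//   (v1i1 scalar_to_vector %k)
// and the result is (X86ISD::SELECTS %k0, (fadd %a, %b), %passthru).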
20395 static int getSEHRegistrationNodeSize(const Function *Fn) {
20396 if (!Fn->hasPersonalityFn())
20397 report_fatal_error(
20398 "querying registration node size for function without personality");
20399 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
20400 // WinEHStatePass for the full struct definition.
20401 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
20402 case EHPersonality::MSVC_X86SEH: return 24;
20403 case EHPersonality::MSVC_CXX: return 16;
20404 default: break;
20405 }
20406 report_fatal_error(
20407 "can only recover FP for 32-bit MSVC EH personality functions");
20408 }
20410 /// When the MSVC runtime transfers control to us, either to an outlined
20411 /// function or when returning to a parent frame after catching an exception, we
20412 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
20413 /// Here's the math:
20414 /// RegNodeBase = EntryEBP - RegNodeSize
20415 /// ParentFP = RegNodeBase - ParentFrameOffset
20416 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
20417 /// subtracting the offset (negative on x86) takes us back to the parent FP.
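/// Worked example (added note, not from the original source): with
/// EntryEBP = 0x1000, RegNodeSize = 24 (MSVC_X86SEH) and a recorded
/// ParentFrameOffset of -64:
///   RegNodeBase = 0x1000 - 24    = 0x0FE8
///   ParentFP    = 0x0FE8 - (-64) = 0x1028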
20418 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
20419 SDValue EntryEBP) {
20420 MachineFunction &MF = DAG.getMachineFunction();
20421 SDLoc dl;
20423 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20424 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20426 // It's possible that the parent function no longer has a personality function
20427 // if the exceptional code was optimized away, in which case we just return
20428 // the incoming EBP.
20429 if (!Fn->hasPersonalityFn())
20430 return EntryEBP;
20432 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
20433 // registration, or the .set_setframe offset.
20434 MCSymbol *OffsetSym =
20435 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
20436 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20437 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
20438 SDValue ParentFrameOffset =
20439 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
20441 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
20442 // prologue to RBP in the parent function.
20443 const X86Subtarget &Subtarget =
20444 static_cast<const X86Subtarget &>(DAG.getSubtarget());
20445 if (Subtarget.is64Bit())
20446 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
20448 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
20449 // RegNodeBase = EntryEBP - RegNodeSize
20450 // ParentFP = RegNodeBase - ParentFrameOffset
20451 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
20452 DAG.getConstant(RegNodeSize, dl, PtrVT));
20453 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
20454 }
20456 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
20457 SelectionDAG &DAG) const {
20458 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
20459 auto isRoundModeCurDirection = [](SDValue Rnd) {
20460 if (!isa<ConstantSDNode>(Rnd))
20461 return false;
20463 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
20464 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
20465 };
20467 SDLoc dl(Op);
20468 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20469 MVT VT = Op.getSimpleValueType();
20470 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
20471 if (IntrData) {
20472 switch(IntrData->Type) {
20473 case INTR_TYPE_1OP: {
20474 // We specify 2 possible opcodes for intrinsics with rounding modes.
20475 // First, we check if the intrinsic may have non-default rounding mode,
20476 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20477 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20478 if (IntrWithRoundingModeOpcode != 0) {
20479 SDValue Rnd = Op.getOperand(2);
20480 if (!isRoundModeCurDirection(Rnd)) {
20481 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
20482 Op.getOperand(1), Rnd);
20483 }
20484 }
20485 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
20486 }
20487 case INTR_TYPE_2OP:
20488 case INTR_TYPE_2OP_IMM8: {
20489 SDValue Src2 = Op.getOperand(2);
20491 if (IntrData->Type == INTR_TYPE_2OP_IMM8)
20492 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20494 // We specify 2 possible opcodes for intrinsics with rounding modes.
20495 // First, we check if the intrinsic may have non-default rounding mode,
20496 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20497 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20498 if (IntrWithRoundingModeOpcode != 0) {
20499 SDValue Rnd = Op.getOperand(3);
20500 if (!isRoundModeCurDirection(Rnd)) {
20501 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
20502 Op.getOperand(1), Src2, Rnd);
20503 }
20504 }
20506 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20507 Op.getOperand(1), Src2);
20508 }
20509 case INTR_TYPE_3OP:
20510 case INTR_TYPE_3OP_IMM8: {
20511 SDValue Src1 = Op.getOperand(1);
20512 SDValue Src2 = Op.getOperand(2);
20513 SDValue Src3 = Op.getOperand(3);
20515 if (IntrData->Type == INTR_TYPE_3OP_IMM8)
20516 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20518 // We specify 2 possible opcodes for intrinsics with rounding modes.
20519 // First, we check if the intrinsic may have non-default rounding mode,
20520 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20521 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20522 if (IntrWithRoundingModeOpcode != 0) {
20523 SDValue Rnd = Op.getOperand(4);
20524 if (!isRoundModeCurDirection(Rnd)) {
20525 return DAG.getNode(IntrWithRoundingModeOpcode,
20526 dl, Op.getValueType(),
20527 Src1, Src2, Src3, Rnd);
20528 }
20529 }
20531 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20532 Src1, Src2, Src3);
20533 }
20534 case INTR_TYPE_4OP:
20535 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
20536 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
20537 case INTR_TYPE_1OP_MASK_RM: {
20538 SDValue Src = Op.getOperand(1);
20539 SDValue PassThru = Op.getOperand(2);
20540 SDValue Mask = Op.getOperand(3);
20541 SDValue RoundingMode;
20542 // We always add rounding mode to the Node.
20543 // If the rounding mode is not specified, we add the
20544 // "current direction" mode.
20545 if (Op.getNumOperands() == 4)
20546 RoundingMode =
20547 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20548 else
20549 RoundingMode = Op.getOperand(4);
20550 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
20551 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20552 RoundingMode),
20553 Mask, PassThru, Subtarget, DAG);
20554 }
20555 case INTR_TYPE_1OP_MASK: {
20556 SDValue Src = Op.getOperand(1);
20557 SDValue PassThru = Op.getOperand(2);
20558 SDValue Mask = Op.getOperand(3);
20559 // We add rounding mode to the Node when
20560 // - RM Opcode is specified and
20561 // - RM is not "current direction".
20562 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20563 if (IntrWithRoundingModeOpcode != 0) {
20564 SDValue Rnd = Op.getOperand(4);
20565 if (!isRoundModeCurDirection(Rnd)) {
20566 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20567 dl, Op.getValueType(),
20568 Src, Rnd),
20569 Mask, PassThru, Subtarget, DAG);
20570 }
20571 }
20572 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
20573 Mask, PassThru, Subtarget, DAG);
20574 }
20575 case INTR_TYPE_SCALAR_MASK: {
20576 SDValue Src1 = Op.getOperand(1);
20577 SDValue Src2 = Op.getOperand(2);
20578 SDValue passThru = Op.getOperand(3);
20579 SDValue Mask = Op.getOperand(4);
20580 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20581 // There are 2 kinds of intrinsics in this group:
20582 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20583 // (2) With rounding mode and sae - 7 operands.
20584 bool HasRounding = IntrWithRoundingModeOpcode != 0;
20585 if (Op.getNumOperands() == (5U + HasRounding)) {
20586 if (HasRounding) {
20587 SDValue Rnd = Op.getOperand(5);
20588 if (!isRoundModeCurDirection(Rnd))
20589 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20590 dl, VT, Src1, Src2, Rnd),
20591 Mask, passThru, Subtarget, DAG);
20592 }
20593 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20594 Src2),
20595 Mask, passThru, Subtarget, DAG);
20596 }
20598 assert(Op.getNumOperands() == (6U + HasRounding) &&
20599 "Unexpected intrinsic form");
20600 SDValue RoundingMode = Op.getOperand(5);
20601 if (HasRounding) {
20602 SDValue Sae = Op.getOperand(6);
20603 if (!isRoundModeCurDirection(Sae))
20604 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20605 dl, VT, Src1, Src2,
20606 RoundingMode, Sae),
20607 Mask, passThru, Subtarget, DAG);
20608 }
20609 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20610 Src2, RoundingMode),
20611 Mask, passThru, Subtarget, DAG);
20612 }
20613 case INTR_TYPE_SCALAR_MASK_RM: {
20614 SDValue Src1 = Op.getOperand(1);
20615 SDValue Src2 = Op.getOperand(2);
20616 SDValue Src0 = Op.getOperand(3);
20617 SDValue Mask = Op.getOperand(4);
20618 // There are 2 kinds of intrinsics in this group:
20619 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20620 // (2) With rounding mode and sae - 7 operands.
20621 if (Op.getNumOperands() == 6) {
20622 SDValue Sae = Op.getOperand(5);
20623 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20624 Sae),
20625 Mask, Src0, Subtarget, DAG);
20626 }
20627 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20628 SDValue RoundingMode = Op.getOperand(5);
20629 SDValue Sae = Op.getOperand(6);
20630 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20631 RoundingMode, Sae),
20632 Mask, Src0, Subtarget, DAG);
20633 }
20634 case INTR_TYPE_2OP_MASK: {
20635 SDValue Src1 = Op.getOperand(1);
20636 SDValue Src2 = Op.getOperand(2);
20637 SDValue PassThru = Op.getOperand(3);
20638 SDValue Mask = Op.getOperand(4);
20640 // We specify 2 possible opcodes for intrinsics with rounding modes.
20641 // First, we check if the intrinsic may have non-default rounding mode,
20642 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20643 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20644 if (IntrWithRoundingModeOpcode != 0) {
20645 SDValue Rnd = Op.getOperand(5);
20646 if (!isRoundModeCurDirection(Rnd)) {
20647 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20648 dl, Op.getValueType(),
20649 Src1, Src2, Rnd),
20650 Mask, PassThru, Subtarget, DAG);
20651 }
20652 }
20653 // TODO: Intrinsics should have fast-math-flags to propagate.
20654 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
20655 Mask, PassThru, Subtarget, DAG);
20656 }
20657 case INTR_TYPE_2OP_MASK_RM: {
20658 SDValue Src1 = Op.getOperand(1);
20659 SDValue Src2 = Op.getOperand(2);
20660 SDValue PassThru = Op.getOperand(3);
20661 SDValue Mask = Op.getOperand(4);
20662 // We specify 2 possible modes for intrinsics, with/without rounding
20663 // mode.
20664 // First, we check if the intrinsic has a rounding mode (6 operands);
20665 // if not, we set the rounding mode to "current".
20666 SDValue Rnd;
20667 if (Op.getNumOperands() == 6)
20668 Rnd = Op.getOperand(5);
20669 else
20670 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20671 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20672 Src1, Src2, Rnd),
20673 Mask, PassThru, Subtarget, DAG);
20674 }
20675 case INTR_TYPE_3OP_SCALAR_MASK: {
20676 SDValue Src1 = Op.getOperand(1);
20677 SDValue Src2 = Op.getOperand(2);
20678 SDValue Src3 = Op.getOperand(3);
20679 SDValue PassThru = Op.getOperand(4);
20680 SDValue Mask = Op.getOperand(5);
20682 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20683 if (IntrWithRoundingModeOpcode != 0) {
20684 SDValue Rnd = Op.getOperand(6);
20685 if (!isRoundModeCurDirection(Rnd))
20686 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20687 dl, VT, Src1, Src2, Src3, Rnd),
20688 Mask, PassThru, Subtarget, DAG);
20689 }
20690 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20691 Src2, Src3),
20692 Mask, PassThru, Subtarget, DAG);
20693 }
20694 case INTR_TYPE_3OP_MASK: {
20695 SDValue Src1 = Op.getOperand(1);
20696 SDValue Src2 = Op.getOperand(2);
20697 SDValue Src3 = Op.getOperand(3);
20698 SDValue PassThru = Op.getOperand(4);
20699 SDValue Mask = Op.getOperand(5);
20701 // We specify 2 possible opcodes for intrinsics with rounding modes.
20702 // First, we check if the intrinsic may have non-default rounding mode,
20703 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20704 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20705 if (IntrWithRoundingModeOpcode != 0) {
20706 SDValue Rnd = Op.getOperand(6);
20707 if (!isRoundModeCurDirection(Rnd)) {
20708 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20709 dl, Op.getValueType(),
20710 Src1, Src2, Src3, Rnd),
20711 Mask, PassThru, Subtarget, DAG);
20712 }
20713 }
20714 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20715 Src1, Src2, Src3),
20716 Mask, PassThru, Subtarget, DAG);
20717 }
20718 case VPERM_2OP : {
20719 SDValue Src1 = Op.getOperand(1);
20720 SDValue Src2 = Op.getOperand(2);
20722 // Swap Src1 and Src2 in the node creation
20723 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
20724 }
20725 case FMA_OP_MASKZ:
20726 case FMA_OP_MASK: {
20727 SDValue Src1 = Op.getOperand(1);
20728 SDValue Src2 = Op.getOperand(2);
20729 SDValue Src3 = Op.getOperand(3);
20730 SDValue Mask = Op.getOperand(4);
20731 MVT VT = Op.getSimpleValueType();
20732 SDValue PassThru = SDValue();
20734 // set PassThru element
20735 if (IntrData->Type == FMA_OP_MASKZ)
20736 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20737 else
20738 PassThru = Src1;
20740 // We specify 2 possible opcodes for intrinsics with rounding modes.
20741 // First, we check if the intrinsic may have non-default rounding mode,
20742 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20743 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20744 if (IntrWithRoundingModeOpcode != 0) {
20745 SDValue Rnd = Op.getOperand(5);
20746 if (!isRoundModeCurDirection(Rnd))
20747 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20748 dl, Op.getValueType(),
20749 Src1, Src2, Src3, Rnd),
20750 Mask, PassThru, Subtarget, DAG);
20751 }
20752 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20753 dl, Op.getValueType(),
20754 Src1, Src2, Src3),
20755 Mask, PassThru, Subtarget, DAG);
20756 }
20757 case IFMA_OP:
20758 // NOTE: We need to swizzle the operands to pass the multiply operands
20759 // first.
20760 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20761 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
20762 case CVTPD2PS:
20763 // ISD::FP_ROUND has a second argument that indicates if the truncation
20764 // does not change the value. Set it to 0 since it can change.
20765 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20766 DAG.getIntPtrConstant(0, dl));
20767 case CVTPD2PS_MASK: {
20768 SDValue Src = Op.getOperand(1);
20769 SDValue PassThru = Op.getOperand(2);
20770 SDValue Mask = Op.getOperand(3);
20771 // We add rounding mode to the Node when
20772 // - RM Opcode is specified and
20773 // - RM is not "current direction".
20774 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20775 if (IntrWithRoundingModeOpcode != 0) {
20776 SDValue Rnd = Op.getOperand(4);
20777 if (!isRoundModeCurDirection(Rnd)) {
20778 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20779 dl, Op.getValueType(),
20780 Src, Rnd),
20781 Mask, PassThru, Subtarget, DAG);
20782 }
20783 }
20784 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20785 // ISD::FP_ROUND has a second argument that indicates if the truncation
20786 // does not change the value. Set it to 0 since it can change.
20787 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20788 DAG.getIntPtrConstant(0, dl)),
20789 Mask, PassThru, Subtarget, DAG);
20790 }
20791 case FPCLASS: {
20792 // FPclass intrinsics
20793 SDValue Src1 = Op.getOperand(1);
20794 MVT MaskVT = Op.getSimpleValueType();
20795 SDValue Imm = Op.getOperand(2);
20796 return DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20797 }
20798 case FPCLASSS: {
20799 SDValue Src1 = Op.getOperand(1);
20800 SDValue Imm = Op.getOperand(2);
20801 SDValue Mask = Op.getOperand(3);
20802 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20803 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
20804 Subtarget, DAG);
20805 // Need to fill with zeros to ensure the bitcast will produce zeroes
20806 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20807 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20808 DAG.getConstant(0, dl, MVT::v8i1),
20809 FPclassMask, DAG.getIntPtrConstant(0, dl));
20810 return DAG.getBitcast(MVT::i8, Ins);
20811 }
20812 case CMP_MASK: {
20813 // Comparison intrinsics with masks.
20814 // Example of transformation:
20815 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20816 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20817 // (i8 (bitcast
20818 // (v8i1 (insert_subvector zero,
20819 // (v2i1 (and (PCMPEQM %a, %b),
20820 // (extract_subvector
20821 // (v8i1 (bitcast %mask)), 0))), 0))))
20822 MVT VT = Op.getOperand(1).getSimpleValueType();
20823 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20824 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20825 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20826 Mask.getSimpleValueType().getSizeInBits());
20827 SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20828 Op.getOperand(2));
20829 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
20830 Subtarget, DAG);
20831 // Need to fill with zeros to ensure the bitcast will produce zeroes
20832 // for the upper bits in the v2i1/v4i1 case.
20833 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20834 DAG.getConstant(0, dl, BitcastVT),
20835 CmpMask, DAG.getIntPtrConstant(0, dl));
20836 return DAG.getBitcast(Op.getValueType(), Res);
20837 }
20839 case CMP_MASK_CC: {
20840 MVT MaskVT = Op.getSimpleValueType();
20841 SDValue Cmp;
20842 SDValue CC = Op.getOperand(3);
20843 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20844 // We specify 2 possible opcodes for intrinsics with rounding modes.
20845 // First, we check if the intrinsic may have non-default rounding mode,
20846 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20847 if (IntrData->Opc1 != 0) {
20848 SDValue Rnd = Op.getOperand(4);
20849 if (!isRoundModeCurDirection(Rnd))
20850 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20851 Op.getOperand(2), CC, Rnd);
20852 }
20853 // default rounding mode
20854 if (!Cmp.getNode())
20855 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20856 Op.getOperand(2), CC);
20858 return Cmp;
20859 }
20860 case CMP_MASK_SCALAR_CC: {
20861 SDValue Src1 = Op.getOperand(1);
20862 SDValue Src2 = Op.getOperand(2);
20863 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20864 SDValue Mask = Op.getOperand(4);
20866 SDValue Cmp;
20867 if (IntrData->Opc1 != 0) {
20868 SDValue Rnd = Op.getOperand(5);
20869 if (!isRoundModeCurDirection(Rnd))
20870 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
20871 }
20872 // default rounding mode
20873 if (!Cmp.getNode())
20874 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20876 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
20877 Subtarget, DAG);
20878 // Need to fill with zeros to ensure the bitcast will produce zeroes
20879 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20880 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20881 DAG.getConstant(0, dl, MVT::v8i1),
20882 CmpMask, DAG.getIntPtrConstant(0, dl));
20883 return DAG.getBitcast(MVT::i8, Ins);
20884 }
20885 case COMI: { // Comparison intrinsics
20886 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20887 SDValue LHS = Op.getOperand(1);
20888 SDValue RHS = Op.getOperand(2);
20889 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20890 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
20891 SDValue SetCC;
20892 switch (CC) {
20893 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
20894 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
20895 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
20896 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
20897 break;
20898 }
20899 case ISD::SETNE: { // (ZF = 1 or PF = 1)
20900 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
20901 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
20902 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
20903 break;
20904 }
20905 case ISD::SETGT: // (CF = 0 and ZF = 0)
20906 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
20907 break;
20908 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
20909 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
20910 break;
20911 }
20912 case ISD::SETGE: // CF = 0
20913 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
20914 break;
20915 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
20916 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
20917 break;
20918 default:
20919 llvm_unreachable("Unexpected illegal condition!");
20920 }
20921 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20922 }
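// Added note (not from the original source): the parity fixups above exist
// because ucomiss/ucomisd report 'unordered' as ZF = PF = CF = 1; e.g.
//   comieq:  sete %al ; setnp %cl ; and  -> NaN compares as 'not equal'
//   comineq: setne %al ; setp %cl ; or   -> NaN compares as 'not equal'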
20923 case COMI_RM: { // Comparison intrinsics with Sae
20924 SDValue LHS = Op.getOperand(1);
20925 SDValue RHS = Op.getOperand(2);
20926 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20927 SDValue Sae = Op.getOperand(4);
20929 SDValue FCmp;
20930 if (isRoundModeCurDirection(Sae))
20931 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20932 DAG.getConstant(CondVal, dl, MVT::i8));
20933 else
20934 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20935 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20936 // Need to fill with zeros to ensure the bitcast will produce zeroes
20937 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20938 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
20939 DAG.getConstant(0, dl, MVT::v16i1),
20940 FCmp, DAG.getIntPtrConstant(0, dl));
20941 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
20942 DAG.getBitcast(MVT::i16, Ins));
20943 }
20944 case VSHIFT:
20945 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20946 Op.getOperand(1), Op.getOperand(2), Subtarget,
20947 DAG);
20948 case COMPRESS_EXPAND_IN_REG: {
20949 SDValue Mask = Op.getOperand(3);
20950 SDValue DataToCompress = Op.getOperand(1);
20951 SDValue PassThru = Op.getOperand(2);
20952 if (isAllOnesConstant(Mask)) // return data as is
20953 return Op.getOperand(1);
20955 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20956 DataToCompress),
20957 Mask, PassThru, Subtarget, DAG);
20958 }
20959 case FIXUPIMM:
20960 case FIXUPIMMS_MASKZ:
20961 case FIXUPIMMS:
20962 case FIXUPIMM_MASKZ:{
20963 SDValue Src1 = Op.getOperand(1);
20964 SDValue Src2 = Op.getOperand(2);
20965 SDValue Src3 = Op.getOperand(3);
20966 SDValue Imm = Op.getOperand(4);
20967 SDValue Mask = Op.getOperand(5);
20968 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20969 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
20970 // We specify 2 possible modes for intrinsics, with/without rounding
20971 // mode.
20972 // First, we check if the intrinsic has a rounding mode (7 operands);
20973 // if not, we set the rounding mode to "current".
20974 SDValue Rnd;
20975 if (Op.getNumOperands() == 7)
20976 Rnd = Op.getOperand(6);
20977 else
20978 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20979 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20980 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20981 Src1, Src2, Src3, Imm, Rnd),
20982 Mask, Passthru, Subtarget, DAG);
20983 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20984 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20985 Src1, Src2, Src3, Imm, Rnd),
20986 Mask, Passthru, Subtarget, DAG);
20987 }
20988 case ROUNDP: {
20989 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20990 // Clear the upper bits of the rounding immediate so that the legacy
20991 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20992 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20993 Op.getOperand(2),
20994 DAG.getConstant(0xf, dl, MVT::i32));
20995 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20996 Op.getOperand(1), RoundingMode);
20997 }
20998 case ROUNDS: {
20999 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
21000 // Clear the upper bits of the rounding immediate so that the legacy
21001 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
21002 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
21003 Op.getOperand(3),
21004 DAG.getConstant(0xf, dl, MVT::i32));
21005 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
21006 Op.getOperand(1), Op.getOperand(2), RoundingMode);
21007 }
21008 default:
21009 break;
21010 }
21011 }
21013 switch (IntNo) {
21014 default: return SDValue(); // Don't custom lower most intrinsics.
21016 // ptest and testp intrinsics. The intrinsics these come from are designed to
21017 // return an integer value, not just an instruction, so lower them to the
21018 // ptest or testp pattern and a setcc for the result.
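// Illustrative sketch (added note, not from the original source):
//   int r = _mm_testz_si128(a, b);
// becomes (zext (setcc COND_E, (X86ISD::PTEST a, b))), i.e. roughly
//   ptest %xmm1, %xmm0 ; sete %al ; movzbl %al, %eax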
21019 case Intrinsic::x86_sse41_ptestz:
21020 case Intrinsic::x86_sse41_ptestc:
21021 case Intrinsic::x86_sse41_ptestnzc:
21022 case Intrinsic::x86_avx_ptestz_256:
21023 case Intrinsic::x86_avx_ptestc_256:
21024 case Intrinsic::x86_avx_ptestnzc_256:
21025 case Intrinsic::x86_avx_vtestz_ps:
21026 case Intrinsic::x86_avx_vtestc_ps:
21027 case Intrinsic::x86_avx_vtestnzc_ps:
21028 case Intrinsic::x86_avx_vtestz_pd:
21029 case Intrinsic::x86_avx_vtestc_pd:
21030 case Intrinsic::x86_avx_vtestnzc_pd:
21031 case Intrinsic::x86_avx_vtestz_ps_256:
21032 case Intrinsic::x86_avx_vtestc_ps_256:
21033 case Intrinsic::x86_avx_vtestnzc_ps_256:
21034 case Intrinsic::x86_avx_vtestz_pd_256:
21035 case Intrinsic::x86_avx_vtestc_pd_256:
21036 case Intrinsic::x86_avx_vtestnzc_pd_256: {
21037 bool IsTestPacked = false;
21038 X86::CondCode X86CC;
21039 switch (IntNo) {
21040 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
21041 case Intrinsic::x86_avx_vtestz_ps:
21042 case Intrinsic::x86_avx_vtestz_pd:
21043 case Intrinsic::x86_avx_vtestz_ps_256:
21044 case Intrinsic::x86_avx_vtestz_pd_256:
21045 IsTestPacked = true;
21046 LLVM_FALLTHROUGH;
21047 case Intrinsic::x86_sse41_ptestz:
21048 case Intrinsic::x86_avx_ptestz_256:
21049 // ZF = 1
21050 X86CC = X86::COND_E;
21051 break;
21052 case Intrinsic::x86_avx_vtestc_ps:
21053 case Intrinsic::x86_avx_vtestc_pd:
21054 case Intrinsic::x86_avx_vtestc_ps_256:
21055 case Intrinsic::x86_avx_vtestc_pd_256:
21056 IsTestPacked = true;
21057 LLVM_FALLTHROUGH;
21058 case Intrinsic::x86_sse41_ptestc:
21059 case Intrinsic::x86_avx_ptestc_256:
21060 // CF = 1
21061 X86CC = X86::COND_B;
21062 break;
21063 case Intrinsic::x86_avx_vtestnzc_ps:
21064 case Intrinsic::x86_avx_vtestnzc_pd:
21065 case Intrinsic::x86_avx_vtestnzc_ps_256:
21066 case Intrinsic::x86_avx_vtestnzc_pd_256:
21067 IsTestPacked = true;
21068 LLVM_FALLTHROUGH;
21069 case Intrinsic::x86_sse41_ptestnzc:
21070 case Intrinsic::x86_avx_ptestnzc_256:
21071 // ZF and CF = 0
21072 X86CC = X86::COND_A;
21073 break;
21074 }
21076 SDValue LHS = Op.getOperand(1);
21077 SDValue RHS = Op.getOperand(2);
21078 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
21079 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
21080 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
21081 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21082 }
21084 case Intrinsic::x86_sse42_pcmpistria128:
21085 case Intrinsic::x86_sse42_pcmpestria128:
21086 case Intrinsic::x86_sse42_pcmpistric128:
21087 case Intrinsic::x86_sse42_pcmpestric128:
21088 case Intrinsic::x86_sse42_pcmpistrio128:
21089 case Intrinsic::x86_sse42_pcmpestrio128:
21090 case Intrinsic::x86_sse42_pcmpistris128:
21091 case Intrinsic::x86_sse42_pcmpestris128:
21092 case Intrinsic::x86_sse42_pcmpistriz128:
21093 case Intrinsic::x86_sse42_pcmpestriz128: {
21094 unsigned Opcode;
21095 X86::CondCode X86CC;
21096 switch (IntNo) {
21097 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
21098 case Intrinsic::x86_sse42_pcmpistria128:
21099 Opcode = X86ISD::PCMPISTR;
21100 X86CC = X86::COND_A;
21101 break;
21102 case Intrinsic::x86_sse42_pcmpestria128:
21103 Opcode = X86ISD::PCMPESTR;
21104 X86CC = X86::COND_A;
21105 break;
21106 case Intrinsic::x86_sse42_pcmpistric128:
21107 Opcode = X86ISD::PCMPISTR;
21108 X86CC = X86::COND_B;
21109 break;
21110 case Intrinsic::x86_sse42_pcmpestric128:
21111 Opcode = X86ISD::PCMPESTR;
21112 X86CC = X86::COND_B;
21113 break;
21114 case Intrinsic::x86_sse42_pcmpistrio128:
21115 Opcode = X86ISD::PCMPISTR;
21116 X86CC = X86::COND_O;
21117 break;
21118 case Intrinsic::x86_sse42_pcmpestrio128:
21119 Opcode = X86ISD::PCMPESTR;
21120 X86CC = X86::COND_O;
21121 break;
21122 case Intrinsic::x86_sse42_pcmpistris128:
21123 Opcode = X86ISD::PCMPISTR;
21124 X86CC = X86::COND_S;
21125 break;
21126 case Intrinsic::x86_sse42_pcmpestris128:
21127 Opcode = X86ISD::PCMPESTR;
21128 X86CC = X86::COND_S;
21129 break;
21130 case Intrinsic::x86_sse42_pcmpistriz128:
21131 Opcode = X86ISD::PCMPISTR;
21132 X86CC = X86::COND_E;
21133 break;
21134 case Intrinsic::x86_sse42_pcmpestriz128:
21135 Opcode = X86ISD::PCMPESTR;
21136 X86CC = X86::COND_E;
21137 break;
21138 }
21139 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21140 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21141 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
21142 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
21143 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21144 }
21146 case Intrinsic::x86_sse42_pcmpistri128:
21147 case Intrinsic::x86_sse42_pcmpestri128: {
21148 unsigned Opcode;
21149 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
21150 Opcode = X86ISD::PCMPISTR;
21151 else
21152 Opcode = X86ISD::PCMPESTR;
21154 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21155 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21156 return DAG.getNode(Opcode, dl, VTs, NewOps);
21157 }
21159 case Intrinsic::x86_sse42_pcmpistrm128:
21160 case Intrinsic::x86_sse42_pcmpestrm128: {
21161 unsigned Opcode;
21162 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
21163 Opcode = X86ISD::PCMPISTR;
21164 else
21165 Opcode = X86ISD::PCMPESTR;
21167 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21168 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21169 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
21170 }
21172 case Intrinsic::eh_sjlj_lsda: {
21173 MachineFunction &MF = DAG.getMachineFunction();
21174 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21175 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
21176 auto &Context = MF.getMMI().getContext();
21177 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
21178 Twine(MF.getFunctionNumber()));
21179 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
21180 DAG.getMCSymbol(S, PtrVT));
21181 }
21183 case Intrinsic::x86_seh_lsda: {
21184 // Compute the symbol for the LSDA. We know it'll get emitted later.
21185 MachineFunction &MF = DAG.getMachineFunction();
21186 SDValue Op1 = Op.getOperand(1);
21187 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
21188 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
21189 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
21191 // Generate a simple absolute symbol reference. This intrinsic is only
21192 // supported on 32-bit Windows, which isn't PIC.
21193 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
21194 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
21195 }
21197 case Intrinsic::x86_seh_recoverfp: {
21198 SDValue FnOp = Op.getOperand(1);
21199 SDValue IncomingFPOp = Op.getOperand(2);
21200 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
21201 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
21202 if (!Fn)
21203 report_fatal_error(
21204 "llvm.x86.seh.recoverfp must take a function as the first argument");
21205 return recoverFramePointer(DAG, Fn, IncomingFPOp);
21206 }
21208 case Intrinsic::localaddress: {
21209 // Returns one of the stack, base, or frame pointer registers, depending on
21210 // which is used to reference local variables.
21211 MachineFunction &MF = DAG.getMachineFunction();
21212 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21213 unsigned Reg;
21214 if (RegInfo->hasBasePointer(MF))
21215 Reg = RegInfo->getBaseRegister();
21216 else // This function handles the SP or FP case.
21217 Reg = RegInfo->getPtrSizedFrameRegister(MF);
21218 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
21219 }
21220 }
21221 }
21223 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21224 SDValue Src, SDValue Mask, SDValue Base,
21225 SDValue Index, SDValue ScaleOp, SDValue Chain,
21226 const X86Subtarget &Subtarget) {
21227 SDLoc dl(Op);
21228 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21229 // Scale must be constant.
21230 if (!C)
21231 return SDValue();
21232 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21233 EVT MaskVT = Mask.getValueType();
21234 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21235 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21236 SDValue Segment = DAG.getRegister(0, MVT::i32);
21237 // If source is undef or we know it won't be used, use a zero vector
21238 // to break register dependency.
21239 // TODO: use undef instead and let BreakFalseDeps deal with it?
21240 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
21241 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21242 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
21243 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21244 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21245 return DAG.getMergeValues(RetOps, dl);
21246 }
21248 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21249 SDValue Src, SDValue Mask, SDValue Base,
21250 SDValue Index, SDValue ScaleOp, SDValue Chain,
21251 const X86Subtarget &Subtarget) {
21252 SDLoc dl(Op);
21253 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21254 // Scale must be constant.
21255 if (!C)
21256 return SDValue();
21257 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21258 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21259 Index.getSimpleValueType().getVectorNumElements());
21261 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21262 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21263 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21264 SDValue Segment = DAG.getRegister(0, MVT::i32);
21265 // If source is undef or we know it won't be used, use a zero vector
21266 // to break register dependency.
21267 // TODO: use undef instead and let BreakFalseDeps deal with it?
21268 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
21269 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21270 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
21271 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21272 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21273 return DAG.getMergeValues(RetOps, dl);
21274 }
21276 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21277 SDValue Src, SDValue Mask, SDValue Base,
21278 SDValue Index, SDValue ScaleOp, SDValue Chain,
21279 const X86Subtarget &Subtarget) {
21280 SDLoc dl(Op);
21281 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21282 // Scale must be constant.
21283 if (!C)
21284 return SDValue();
21285 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21286 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21287 SDValue Segment = DAG.getRegister(0, MVT::i32);
21288 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21289 Index.getSimpleValueType().getVectorNumElements());
21291 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21292 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
21293 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
21294 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21295 return SDValue(Res, 1);
21296 }
21298 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21299 SDValue Mask, SDValue Base, SDValue Index,
21300 SDValue ScaleOp, SDValue Chain,
21301 const X86Subtarget &Subtarget) {
21302 SDLoc dl(Op);
21303 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21304 // Scale must be constant.
21305 if (!C)
21306 return SDValue();
21307 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21308 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21309 SDValue Segment = DAG.getRegister(0, MVT::i32);
21310 MVT MaskVT =
21311 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
21312 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21313 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
21314 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
21315 return SDValue(Res, 0);
21316 }
21318 /// Handles the lowering of builtin intrinsics that return the value
21319 /// of the extended control register.
21320 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
21321 SelectionDAG &DAG,
21322 const X86Subtarget &Subtarget,
21323 SmallVectorImpl<SDValue> &Results) {
21324 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21325 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21326 SDValue LO, HI;
21328 // The ECX register is used to select the index of the XCR register to
21329 // return.
21330 SDValue Chain =
21331 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
21332 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
21333 Chain = SDValue(N1, 0);
21335 // Reads the content of XCR and returns it in registers EDX:EAX.
21336 if (Subtarget.is64Bit()) {
21337 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
21338 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21339 LO.getValue(2));
21340 } else {
21341 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
21342 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21343 LO.getValue(2));
21344 }
21345 Chain = HI.getValue(1);
21347 if (Subtarget.is64Bit()) {
21348 // Merge the two 32-bit values into a 64-bit one.
21349 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21350 DAG.getConstant(32, DL, MVT::i8));
21351 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21352 Results.push_back(Chain);
21353 return;
21354 }
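// Worked example (added note, not from the original source): with
// EAX = 0x00000001 and EDX = 0x00000007 after xgetbv, the merge above yields
//   (0x7 << 32) | 0x1 = 0x0000000700000001.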
21356 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21357 SDValue Ops[] = { LO, HI };
21358 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21359 Results.push_back(Pair);
21360 Results.push_back(Chain);
21361 }
21363 /// Handles the lowering of builtin intrinsics that read performance monitor
21364 /// counters (x86_rdpmc).
21365 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
21366 SelectionDAG &DAG,
21367 const X86Subtarget &Subtarget,
21368 SmallVectorImpl<SDValue> &Results) {
21369 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21370 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21371 SDValue LO, HI;
21373 // The ECX register is used to select the index of the performance counter
21374 // to read.
21375 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
21376 N->getOperand(2));
21377 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
21379 // Reads the content of a 64-bit performance counter and returns it in the
21380 // registers EDX:EAX.
21381 if (Subtarget.is64Bit()) {
21382 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21383 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21384 LO.getValue(2));
21385 } else {
21386 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21387 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21388 LO.getValue(2));
21389 }
21390 Chain = HI.getValue(1);
21392 if (Subtarget.is64Bit()) {
21393 // The EAX register is loaded with the low-order 32 bits. The EDX register
21394 // is loaded with the supported high-order bits of the counter.
21395 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21396 DAG.getConstant(32, DL, MVT::i8));
21397 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21398 Results.push_back(Chain);
21399 return;
21400 }
21402 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21403 SDValue Ops[] = { LO, HI };
21404 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21405 Results.push_back(Pair);
21406 Results.push_back(Chain);
21407 }
21409 /// Handles the lowering of builtin intrinsics that read the time stamp counter
21410 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
21411 /// READCYCLECOUNTER nodes.
21412 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
21413 SelectionDAG &DAG,
21414 const X86Subtarget &Subtarget,
21415 SmallVectorImpl<SDValue> &Results) {
21416 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21417 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
21418 SDValue LO, HI;
21420 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
21421 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
21422 // and the EAX register is loaded with the low-order 32 bits.
21423 if (Subtarget.is64Bit()) {
21424 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21425 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21428 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21429 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21432 SDValue Chain = HI.getValue(1);
21434 if (Opcode == X86ISD::RDTSCP_DAG) {
21435 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21437 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
21438 // the ECX register. Add 'ecx' explicitly to the chain.
21439 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
21441 // Explicitly store the content of ECX at the location passed in input
21442 // to the 'rdtscp' intrinsic.
21443 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
21444 MachinePointerInfo());
21447 if (Subtarget.is64Bit()) {
21448 // The EDX register is loaded with the high-order 32 bits of the MSR, and
21449 // the EAX register is loaded with the low-order 32 bits.
21450 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21451 DAG.getConstant(32, DL, MVT::i8));
21452 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21453 Results.push_back(Chain);
21457 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21458 SDValue Ops[] = { LO, HI };
21459 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21460 Results.push_back(Pair);
21461 Results.push_back(Chain);
21464 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
21465 SelectionDAG &DAG) {
21466 SmallVector<SDValue, 2> Results;
21468 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
21470 return DAG.getMergeValues(Results, DL);
21473 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
21474 MachineFunction &MF = DAG.getMachineFunction();
21475 SDValue Chain = Op.getOperand(0);
21476 SDValue RegNode = Op.getOperand(2);
21477 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21479 report_fatal_error("EH registrations only live in functions using WinEH");
21481 // Cast the operand to an alloca, and remember the frame index.
21482 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
21484 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
21485 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
21487 // Return the chain operand without making any DAG nodes.
21491 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
21492 MachineFunction &MF = DAG.getMachineFunction();
21493 SDValue Chain = Op.getOperand(0);
21494 SDValue EHGuard = Op.getOperand(2);
21495 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21497 report_fatal_error("EHGuard only live in functions using WinEH");
21499 // Cast the operand to an alloca, and remember the frame index.
21500 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
21502 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
21503 EHInfo->EHGuardFrameIndex = FINode->getIndex();
21505 // Return the chain operand without making any DAG nodes.
21509 /// Emit Truncating Store with signed or unsigned saturation.
21511 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
21512 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
21513 SelectionDAG &DAG) {
21515 SDVTList VTs = DAG.getVTList(MVT::Other);
21516 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
21517 SDValue Ops[] = { Chain, Val, Ptr, Undef };
21519 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21520 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
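
// Example of the saturation semantics (a sketch): truncating the i16 value
// 0x7FFF (32767) to i8 stores 0x7F (127) under signed saturation but 0xFF
// (255) under unsigned saturation, whereas a plain truncating store would
// simply keep the low byte.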
/// Emit a masked truncating store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
      return SDValue();
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64:
    case Intrinsic::x86_umwait:
    case Intrinsic::x86_tpause: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      unsigned Opcode;

      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_umwait:
        Opcode = X86ISD::UMWAIT;
        break;
      case Intrinsic::x86_tpause:
        Opcode = X86ISD::TPAUSE;
        break;
      case Intrinsic::x86_lwpins32:
      case Intrinsic::x86_lwpins64:
        Opcode = X86ISD::LWPINS;
        break;
      }

      SDValue Operation =
          DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                         Operation.getValue(1));
    }
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, cast to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i8),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
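  // For reference: RDRAND/RDSEED set CF=1 and write a random value on
  // success; on failure they set CF=0 and clear the destination. The CMOV
  // above therefore yields isValid = 1 exactly when CF was set, and
  // otherwise forwards the (zero) result, so callers can test validity
  // without a branch.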
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    //gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    //scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
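  // Worked example of the CF regeneration above: the ADD against -1 (0xFF)
  // rebuilds EFLAGS.CF from the i8 carry-in. Carry-in 1 gives 1 + 0xFF =
  // 0x100, which wraps to 0 and sets CF; carry-in 0 gives 0xFF with CF
  // clear. The adc/sbb node then consumes that CF.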
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(), true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind
    // codes simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const MachineFunction &MF = DAG.getMachineFunction();

  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Case("ebp", X86::EBP)
                     .Case("rbp", X86::RBP)
                     .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      unsigned FrameReg =
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}

SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
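
// Note on the 2 * SlotSize offset above (an interpretation, not stated in
// the original source): between the frame address and the incoming
// arguments sit two stack slots, the saved frame pointer and the return
// address, hence two slot sizes.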
unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}

unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}

SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after the CGBR pass ran.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we will end up in a situation where we will
  // reference a virtual register that is not defined!
  if (!Subtarget.is64Bit()) {
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
  }
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                     Op.getOperand(0));
}

static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}

SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] =
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
                     /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td.
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
                     /* Alignment = */ 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
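
    // For reference, the 24 bytes written above decode as follows (REX_WB is
    // 0x49, and 0xBB/0xBA are MOV64ri combined with the low 3 encoding bits
    // of R11/R10):
    //    0: 49 BB <FPtr, 8 bytes>   movabsq $FPtr, %r11
    //   10: 49 BA <Nest, 8 bytes>   movabsq $Nest, %r10
    //   20: 49 FF E3                jmpq   *%r11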
  } else {
    const Function *Func =
        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            auto &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                     Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
                     /* Alignment = */ 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                /* Alignment = */ 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] =
        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
                     /* Alignment = */ 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
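
// The 10-byte 32-bit trampoline written above encodes:
//   offset 0: B8+reg <Nest, 4 bytes>   movl $Nest, %ecx (or %eax)
//   offset 5: E9 <Disp, 4 bytes>       jmp  FPtr (PC-relative)
// where Disp is computed relative to the end of the jmp instruction
// (Trmp + 10), which is why Addr is biased by 10 before the subtraction.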

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

   FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

   To perform the conversion, we do:
     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */
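
  // Checking the formula against each FPSR mode (bit 11 is 0x800, bit 10 is
  // 0x400):
  //   00 nearest: ((0 | 0) + 1) & 3 = 1
  //   01 -inf:    ((0 | 2) + 1) & 3 = 3
  //   10 +inf:    ((1 | 0) + 1) & 3 = 2
  //   11 zero:    ((1 | 2) + 1) & 3 = 0
  // which matches the FLT_ROUNDS encoding above.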
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

  // Transform as necessary
  SDValue CWD1 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x800, DL, MVT::i16)),
                  DAG.getConstant(11, DL, MVT::i8));
  SDValue CWD2 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x400, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));

  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i16,
                  DAG.getNode(ISD::ADD, DL, MVT::i16,
                              DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                              DAG.getConstant(1, DL, MVT::i16)),
                  DAG.getConstant(3, DL, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}

// Split a unary integer op into 2 half sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  SDValue Src = Op.getOperand(0);
  assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
         "Src and Op should have the same element type!");

  // Extract the Lo/Hi vectors
  SDLoc dl(Op);
  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);

  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}

// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is512BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // Split the vector; its Lo and Hi parts will be handled in the next
  // iteration.
  if (NumElems > 16 ||
      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
    return LowerVectorIntUnary(Op, DAG);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use the natively supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
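
// Example of the Delta adjustment above (a sketch): for a v16i8 input, each
// byte is zero-extended to 32 bits, so vplzcntd counts 24 extra leading
// zeros that belong to the extension. ctlz8(0x01) = 7, while
// ctlz32(0x00000001) = 31, and 31 - 24 = 7 as required.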

// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
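
  // Worked byte examples of the nibble LUT: for 0x1A the hi nibble is 0x1, so
  // the result is LUT[0x1] = 3 (0x1A = 0b00011010 has three leading zeros);
  // for 0x0A the hi nibble is zero, so we add LUT[0x0] + LUT[0xA] = 4 + 0 = 4
  // (0x0A = 0b00001010 has four leading zeros).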

  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ;
  if (CurrVT.is512BitVector()) {
    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
  } else {
    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
  }

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge the result back from vXi8 to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    if (CurrVT.is512BitVector()) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
    } else {
      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    }
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);

    CurrVT = NextVT;
  }

  return Res;
}

static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI() &&
      // vXi8 vectors need to be promoted to 512-bits for vXi32.
      (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
    return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}

static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), return NumBits.
    SDValue Ops[] = {
      Op,
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Op.getValue(1)
    };
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
                   DAG.getConstant(NumBits - 1, dl, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);

  if (VT.isVector()) {
    SDValue N0 = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, VT);

    // lsb(x) = (x & -x)
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));

    // cttz_undef(x) = (width - 1) - ctlz(lsb)
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
    }

    // cttz(x) = ctpop(lsb - 1)
    SDValue One = DAG.getConstant(1, dl, VT);
    return DAG.getNode(ISD::CTPOP, dl, VT,
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
  }
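
  // A quick check of the identities used above (a sketch): for x = 12 =
  // 0b1100, lsb = x & -x = 0b0100, so cttz(12) = ctpop(0b0100 - 1) =
  // ctpop(0b0011) = 2; for an 8-bit element, cttz_undef(12) =
  // 7 - ctlz(0b0100) = 7 - 5 = 2 as well.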

  assert(Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));

  // If src is zero (i.e. bsf sets ZF), return NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}

/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is512BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
                       Op.getOperand(0), Op.getOperand(1));
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}

static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
    // Since X86 does not have CMOV for 8-bit integers, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDLoc DL(Op);
    SDValue N0 = Op.getOperand(0);
    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                              DAG.getConstant(0, DL, VT), N0);
    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
                     SDValue(Neg.getNode(), 1)};
    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
  }

  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntUnary(Op, DAG);
}

static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // For AVX1 cases, split to use legal ops (everything but v4i64).
  if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  SDLoc DL(Op);
  unsigned Opcode = Op.getOpcode();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);

  // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
  // using the SMIN/SMAX instructions and flipping the signbit back.
  if (VT == MVT::v8i16) {
    assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
           "Unexpected MIN/MAX opcode");
    SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
    N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
    N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
    Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
    SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
    return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
  }
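
  // Example of the signbit flip (a sketch): umin(0xFFFF, 0x0001) on i16.
  // XOR with 0x8000 gives 0x7FFF and 0x8001; as signed values these compare
  // 0x8001 (-32767) < 0x7FFF (32767), so smin picks 0x8001, and flipping the
  // signbit back yields 0x0001, the correct unsigned minimum.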

  // Else, expand to a compare/select.
  ISD::CondCode CC;
  switch (Opcode) {
  case ISD::SMIN: CC = ISD::CondCode::SETLT;  break;
  case ISD::SMAX: CC = ISD::CondCode::SETGT;  break;
  case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
  case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
  default: llvm_unreachable("Unknown MINMAX opcode");
  }

  SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
  return DAG.getSelect(DL, VT, Cond, N0, N1);
}

static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    if (Subtarget.hasInt256()) {
      // For 512-bit vectors, split into 256-bit vectors to allow the
      // sign-extension to occur.
      if (VT == MVT::v64i8)
        return Lower512IntArith(Op, DAG);

      // For 256-bit vectors, split into 128-bit vectors to allow the
      // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
        return Lower256IntArith(Op, DAG);

      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
    }

    assert(VT == MVT::v16i8 &&
           "Pre-AVX2 support only supports v16i8 multiplication");
    MVT ExVT = MVT::v8i16;

    // Extract the lo parts and sign extend to i16.
    // We're going to mask off the low byte of each result element of the
    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
    // element.
    const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
                              4, -1, 5, -1, 6, -1, 7, -1};
    SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
    SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);

    // Extract the hi parts and sign extend to i16.
    // We're going to mask off the low byte of each result element of the
    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
    // element.
    const int HiShufMask[] = {8,  -1, 9,  -1, 10, -1, 11, -1,
                              12, -1, 13, -1, 14, -1, 15, -1};
    SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
    SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);

    // Multiply, mask the lower 8 bits of the lo/hi results and pack.
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmulld is available!");

    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                                DAG.getBitcast(MVT::v2i64, A),
                                DAG.getBitcast(MVT::v2i64, B));
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                               DAG.getBitcast(MVT::v2i64, Aodds),
                               DAG.getBitcast(MVT::v2i64, Bodds));

    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");
  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");

  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
  //  return AloBlo + Hi;
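  //
  // This is just 64x64->64 multiplication written in 32-bit halves:
  //   a * b = (aLo + 2^32*aHi) * (bLo + 2^32*bHi)
  //         = aLo*bLo + 2^32*(aLo*bHi + aHi*bLo)   (mod 2^64)
  // with pmuludq providing the 32x32->64 products of the low halves; the
  // aHi*bHi term is shifted out entirely.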
22662 KnownBits AKnown, BKnown;
22663 DAG.computeKnownBits(A, AKnown);
22664 DAG.computeKnownBits(B, BKnown);
22666 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22667 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
22668 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
22670 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22671 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
22672 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
22674 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22676 // Only multiply lo/hi halves that aren't known to be zero.
22677 SDValue AloBlo = Zero;
22678 if (!ALoIsZero && !BLoIsZero)
22679 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
22681 SDValue AloBhi = Zero;
22682 if (!ALoIsZero && !BHiIsZero) {
22683 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22684 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
22687 SDValue AhiBlo = Zero;
22688 if (!AHiIsZero && !BLoIsZero) {
22689 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22690 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
22693 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22694 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
22696 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
22699 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22700 SelectionDAG &DAG) {
22702 MVT VT = Op.getSimpleValueType();
22704 // Decompose 256-bit ops into smaller 128-bit ops.
22705 if (VT.is256BitVector() && !Subtarget.hasInt256())
22706 return Lower256IntArith(Op, DAG);
22708 // Only i8 vectors should need custom lowering after this.
22709 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22710 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22711 "Unsupported vector type");
22713 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22714 // logical shift down the upper half and pack back to i8.
22715 SDValue A = Op.getOperand(0);
22716 SDValue B = Op.getOperand(1);
22718 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22719 // and then ashr/lshr the upper bits down to the lower bits before multiply.
22720 unsigned Opcode = Op.getOpcode();
22721 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22722 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22724 // For 512-bit vectors, split into 256-bit vectors to allow the
22725 // sign-extension to occur.
22726 if (VT == MVT::v64i8)
22727 return Lower512IntArith(Op, DAG);
22729 // AVX2 implementations - extend xmm subvectors to ymm.
22730 if (Subtarget.hasInt256()) {
22731 unsigned NumElems = VT.getVectorNumElements();
22732 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22733 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22735 if (VT == MVT::v32i8) {
22736 if (Subtarget.canExtendTo512BW()) {
22737 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22738 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22739 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22740 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22741 DAG.getConstant(8, dl, MVT::v32i16));
22742 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22744 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22745 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22746 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22747 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22748 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22749 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22750 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22751 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22752 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22753 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22754 DAG.getConstant(8, dl, MVT::v16i16));
22755 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22756 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22757 DAG.getConstant(8, dl, MVT::v16i16));
22758 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22759 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
22760 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22761 16, 17, 18, 19, 20, 21, 22, 23};
22762 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22763 24, 25, 26, 27, 28, 29, 30, 31};
22764 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22765 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
22766 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
22769 assert(VT == MVT::v16i8 && "Unexpected VT");
22771 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22772 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22773 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22774 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22775 DAG.getConstant(8, dl, MVT::v16i16));
22776 // If we have BWI we can use truncate instruction.
22777 if (Subtarget.hasBWI())
22778 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22779 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22780 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
22781 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22784 assert(VT == MVT::v16i8 &&
22785 "Pre-AVX2 support only supports v16i8 multiplication");
22786 MVT ExVT = MVT::v8i16;
22787 unsigned ExSSE41 = ISD::MULHU == Opcode ? ISD::ZERO_EXTEND_VECTOR_INREG
22788 : ISD::SIGN_EXTEND_VECTOR_INREG;
22790 // Extract the lo parts and zero/sign extend to i16.
22792 if (Subtarget.hasSSE41()) {
22793 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
22794 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
22796 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22797 -1, 4, -1, 5, -1, 6, -1, 7};
22798 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22799 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22800 ALo = DAG.getBitcast(ExVT, ALo);
22801 BLo = DAG.getBitcast(ExVT, BLo);
22802 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22803 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22806 // Extract the hi parts and zero/sign extend to i16.
22808 if (Subtarget.hasSSE41()) {
22809 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22810 -1, -1, -1, -1, -1, -1, -1, -1};
22811 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22812 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22813 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
22814 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
22816 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22817 -1, 12, -1, 13, -1, 14, -1, 15};
22818 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22819 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22820 AHi = DAG.getBitcast(ExVT, AHi);
22821 BHi = DAG.getBitcast(ExVT, BHi);
22822 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22823 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22826 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
22827 // pack back to v16i8.
22828 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22829 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22830 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22831 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
22832 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22835 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22836 assert(Subtarget.isTargetWin64() && "Unexpected target");
22837 EVT VT = Op.getValueType();
22838 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22839 "Unexpected return type for lowering");
22843 switch (Op->getOpcode()) {
22844 default: llvm_unreachable("Unexpected request for libcall!");
22845 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22846 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22847 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22848 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22849 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
22850 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
22854 SDValue InChain = DAG.getEntryNode();
22856 TargetLowering::ArgListTy Args;
22857 TargetLowering::ArgListEntry Entry;
22858 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22859 EVT ArgVT = Op->getOperand(i).getValueType();
22860 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22861 "Unexpected argument type for lowering");
22862 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22863 Entry.Node = StackPtr;
22864 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22865 MachinePointerInfo(), /* Alignment = */ 16);
22866 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22867 Entry.Ty = PointerType::get(ArgTy,0);
22868 Entry.IsSExt = false;
22869 Entry.IsZExt = false;
22870 Args.push_back(Entry);
22873 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22874 getPointerTy(DAG.getDataLayout()));
22876 TargetLowering::CallLoweringInfo CLI(DAG);
22877 CLI.setDebugLoc(dl)
22880 getLibcallCallingConv(LC),
22881 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
22884 .setSExtResult(isSigned)
22885 .setZExtResult(!isSigned);
22887 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
22888 return DAG.getBitcast(VT, CallInfo.first);

static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  MVT VT = Op0.getSimpleValueType();
  SDLoc dl(Op);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned Opcode = Op.getOpcode();
    unsigned NumElems = VT.getVectorNumElements();
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
    SDValue Ops[] = {
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
    };
    return DAG.getMergeValues(Ops, dl);
  }

  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
         (VT == MVT::v16i32 && Subtarget.hasAVX512()));

  int NumElts = VT.getVectorNumElements();

  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
  // Place the odd value at an even position (basically, shift all values 1
  // step to the left):
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
                                      makeArrayRef(&Mask[0], NumElts));
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
                                      makeArrayRef(&Mask[0], NumElts));

  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opcode =
      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
                                                DAG.getBitcast(MulVT, Op0),
                                                DAG.getBitcast(MulVT, Op1)));
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
  // => <2 x i64> <bf|dh>
  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
                                                DAG.getBitcast(MulVT, Odd0),
                                                DAG.getBitcast(MulVT, Odd1)));

  // Shuffle it back into the right order.
  SmallVector<int, 16> HighMask(NumElts);
  SmallVector<int, 16> LowMask(NumElts);
  for (int i = 0; i != NumElts; ++i) {
    HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
    LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
  }
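
  // For example, with NumElts == 4 this produces HighMask = {1, 5, 3, 7} and
  // LowMask = {0, 4, 2, 6}: even lanes come from Mul1 (<ae|cg>) and odd lanes
  // from Mul2 (<bf|dh>), selecting either the high (Highs) or low (Lows) i32
  // half of each i64 product.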
  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);

  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
  if (IsSigned && !Subtarget.hasSSE41()) {
    SDValue ShAmt = DAG.getConstant(
        31, dl,
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }
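
  // This applies the identity
  //   (a * b)_hi_signed == (a * b)_hi_unsigned - (a < 0 ? b : 0)
  //                                            - (b < 0 ? a : 0)
  // where T1 and T2 are the two correction terms. E.g. for i32 a = -1, b = 2:
  // the unsigned high half is 1, T1 = 2, T2 = 0, and 1 - 2 = -1 is the high
  // half of the signed product -2.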

  // The first result of MUL_LOHI is actually the low value, followed by the
  // high one.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
}

// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget.
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
                                        unsigned Opcode) {
  if (VT.getScalarSizeInBits() < 16)
    return false;

  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
    return true;

  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
                (VT.is256BitVector() && Subtarget.hasInt256());

  bool AShift = LShift && (Subtarget.hasAVX512() ||
                           (VT != MVT::v2i64 && VT != MVT::v4i64));
  return (Opcode == ISD::SRA) ? AShift : LShift;
}

// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
                                      unsigned Opcode) {
  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}

// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget.
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
                                    unsigned Opcode) {
  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
    return false;

  // vXi16 supported only on AVX-512, BWI
  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
    return false;

  if (Subtarget.hasAVX512())
    return true;

  bool LShift = VT.is128BitVector() || VT.is256BitVector();
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
  return (Opcode == ISD::SRA) ? AShift : LShift;
}

static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);

    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
             "Unsupported PCMPGT op");
      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
                         getZeroVector(VT, Subtarget, DAG, dl), R);
    }

    if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
      SDValue Upper =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt - 32, DAG);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // SRA upper i32, SRL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt, DAG);
      SDValue Lower =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
      Lower = DAG.getBitcast(ExVT, Lower);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };
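
  // E.g. for a v2i64 ashr by 40: viewed as v4i32 <lo0|hi0|lo1|hi1>, Upper
  // holds the splatted sign words and Lower holds hi >> 8; the {5, 1, 7, 3}
  // shuffle then rebuilds each i64 lane from (hi >> 8) in its low half and
  // the sign splat in its high half, i.e. the sign-extended 64-bit result.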

  // Optimize shl/srl/sra with constant shift amount.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
      uint64_t ShiftAmt = ShiftConst->getZExtValue();

      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

      // i64 SRA needs to be performed as partial shifts.
      if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
           (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
          Op.getOpcode() == ISD::SRA)
        return ArithmeticShiftRight64(ShiftAmt);

      if (VT == MVT::v16i8 ||
          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
          VT == MVT::v64i8) {
        unsigned NumElts = VT.getVectorNumElements();
        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

        // Simple i8 add case
        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
          return DAG.getNode(ISD::ADD, dl, VT, R, R);

        // ashr(R, 7) === cmp_slt(R, 0)
        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
          if (VT.is512BitVector()) {
            assert(VT == MVT::v64i8 && "Unexpected element type!");
            SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
                                       ISD::SETGT);
            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
          }
          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
        }

        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
        if (VT == MVT::v16i8 && Subtarget.hasXOP())
          return SDValue();

        if (Op.getOpcode() == ISD::SHL) {
          // Make a large shift.
          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                   R, ShiftAmt, DAG);
          SHL = DAG.getBitcast(VT, SHL);
          // Zero out the rightmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SHL,
                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
        }
        if (Op.getOpcode() == ISD::SRL) {
          // Make a large shift.
          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
                                                   R, ShiftAmt, DAG);
          SRL = DAG.getBitcast(VT, SRL);
          // Zero out the leftmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SRL,
                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
        }
        if (Op.getOpcode() == ISD::SRA) {
          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
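          // with Mask = 128 >> Amt, the shifted-down sign bit. E.g. for
          // Amt == 3 and a lane value 0xF0 (-16): lshr gives 0x1E, xor with
          // Mask = 0x10 gives 0x0E, and subtracting 0x10 gives 0xFE (-2),
          // which is -16 >> 3.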
          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
          return Res;
        }
        llvm_unreachable("Unknown shift opcode.");
      }
    }
  }

  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
  if (!Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {

    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
    unsigned SubVectorScale = 1;
    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      SubVectorScale =
          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
      Amt = Amt.getOperand(0);
    }

    // Peek through any splat that was introduced for i64 shift vectorization.
    int SplatIndex = -1;
    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
      if (SVN->isSplat()) {
        SplatIndex = SVN->getSplatIndex();
        Amt = Amt.getOperand(0);
        assert(SplatIndex < (int)VT.getVectorNumElements() &&
               "Splat shuffle referencing second operand");
      }

    if (Amt.getOpcode() != ISD::BITCAST ||
        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
      return SDValue();

    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     (SubVectorScale * VT.getVectorNumElements());
    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
    uint64_t ShiftAmt = 0;
    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
    for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
      if (!C)
        return SDValue();
      // 6 == Log2(64)
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    }

    // Check remaining shift amounts (if not a splat).
    if (SplatIndex < 0) {
      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
        uint64_t ShAmt = 0;
        for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
          if (!C)
            return SDValue();
          // 6 == Log2(64)
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
        }
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
    }

    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

    if (Op.getOpcode() == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);
  }

  return SDValue();
}

// Determine if V is a splat value, and return the scalar.
static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
                            SelectionDAG &DAG, const X86Subtarget &Subtarget,
                            unsigned Opcode) {
  V = peekThroughEXTRACT_SUBVECTORs(V);

  // Check if this is a splat build_vector node.
  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
    SDValue SplatAmt = BV->getSplatValue();
    if (SplatAmt && SplatAmt.isUndef())
      return SDValue();
    return SplatAmt;
  }

  // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
  if (V.getOpcode() == ISD::SUB &&
      !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
    SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
    SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));

    // Ensure that the corresponding splat BV element is not UNDEF.
    BitVector UndefElts;
    BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
    ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
    if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
      unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
      if (!UndefElts[SplatIdx])
        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                           VT.getVectorElementType(), V,
                           DAG.getIntPtrConstant(SplatIdx, dl));
    }
  }

  // Check if this is a shuffle node doing a splat.
  ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
  if (!SVN || !SVN->isSplat())
    return SDValue();

  unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
  SDValue InVec = V.getOperand(0);
  if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
    assert((SplatIdx < VT.getVectorNumElements()) &&
           "Unexpected shuffle index found!");
    return InVec.getOperand(SplatIdx);
  } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
      if (C->getZExtValue() == SplatIdx)
        return InVec.getOperand(1);
  }

  // Avoid introducing an extract element from a shuffle.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                     VT.getVectorElementType(), InVec,
                     DAG.getIntPtrConstant(SplatIdx, dl));
}

static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();

  unsigned X86OpcI = (Opcode == ISD::SHL) ? X86ISD::VSHLI :
    (Opcode == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  unsigned X86OpcV = (Opcode == ISD::SHL) ? X86ISD::VSHL :
    (Opcode == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;

  Amt = peekThroughEXTRACT_SUBVECTORs(Amt);

  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
    if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
      MVT EltVT = VT.getVectorElementType();
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
    }
  }

  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     VT.getVectorNumElements();
    std::vector<SDValue> Vals(Ratio);
    for (unsigned i = 0; i != Ratio; ++i)
      Vals[i] = Amt.getOperand(i);
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }

    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }
  return SDValue();
}

// Convert a shift/rotate left amount to a multiplication scale factor.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Amt.getSimpleValueType();
  if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
        (Subtarget.hasInt256() && VT == MVT::v16i16) ||
        (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
    return SDValue();

  if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
    SmallVector<SDValue, 8> Elts;
    MVT SVT = VT.getVectorElementType();
    unsigned SVTBits = SVT.getSizeInBits();
    APInt One(SVTBits, 1);
    unsigned NumElems = VT.getVectorNumElements();

    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Op = Amt->getOperand(i);
      if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }
      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
    return DAG.getBuildVector(VT, dl, Elts);
  }

  // If the target doesn't support variable shifts, use either FP conversion
  // or integer multiplication to avoid shifting each element individually.
  if (VT == MVT::v4i32) {
    Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
                      DAG.getConstant(0x3f800000U, dl, VT));
    Amt = DAG.getBitcast(MVT::v4f32, Amt);
    return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
  }
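
  // The v4i32 case above builds 2^Amt through the IEEE-754 exponent field:
  // (Amt << 23) + 0x3f800000 reinterpreted as a float is 1.0 * 2^Amt, so
  // e.g. Amt = 5 gives the bit pattern 0x42000000 == 32.0f and FP_TO_SINT
  // produces the scale factor 32.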

  // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
  if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
    SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
    Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
    Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
    if (Subtarget.hasSSE41())
      return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);

    return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
                                DAG.getBitcast(VT, Hi),
                                {0, 2, 4, 6, 8, 10, 12, 14});
  }

  return SDValue();
}

static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

  assert(VT.isVector() && "Custom lowering only for vector shifts!");
  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
    return V;

  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
    return V;

  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
    return Op;

  // XOP has 128-bit variable logical/arithmetic shifts.
  // +ve/-ve Amt = shift left/right.
  if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
                             VT == MVT::v8i16 || VT == MVT::v16i8)) {
    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
      SDValue Zero = DAG.getConstant(0, dl, VT);
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
    }
    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
    if (Op.getOpcode() == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }

  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
    // Splat the shift amounts so the scalar shifts above will catch it.
    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }

  // i64 vector arithmetic shift can be emulated with the transform:
  // M = lshr(SIGN_MASK, Amt)
  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
      Op.getOpcode() == ISD::SRA) {
    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }

  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a BLENDing shuffle instead of scalarizing it.
  // Example:
  //    (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  //    (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes in parallel before blending.
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
                      (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
    SDValue Amt1, Amt2;
    unsigned NumElts = VT.getVectorNumElements();
    SmallVector<int, 8> ShuffleMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue A = Amt->getOperand(i);
      if (A.isUndef()) {
        ShuffleMask.push_back(SM_SentinelUndef);
        continue;
      }
      if (!Amt1 || Amt1 == A) {
        ShuffleMask.push_back(i);
        Amt1 = A;
        continue;
      }
      if (!Amt2 || Amt2 == A) {
        ShuffleMask.push_back(i + NumElts);
        Amt2 = A;
        continue;
      }
      break;
    }

    // Only perform this blend if we can perform it without loading a mask.
    if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
        isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
        (VT != MVT::v16i16 ||
         is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
        (VT == MVT::v4i32 || Subtarget.hasSSE41() ||
         Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
      return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
    }
  }

  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  if (Op.getOpcode() == ISD::SHL)
    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);

  // v4i32 non-uniform shifts.
  // If the shift amount is constant we can shift each lane using the SSE2
  // immediate shifts, else we need to zero-extend each lane to the lower i64
  // and shift using the SSE2 variable shifts.
  // The separate results can then be blended together.
  if (VT == MVT::v4i32) {
    unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default:
        llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL:
        Opc = X86ISD::VSHL;
        break;
      case ISD::SRL:
        Opc = X86ISD::VSRL;
        break;
      case ISD::SRA:
        Opc = X86ISD::VSRA;
        break;
      }
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. On AVX we're better off
      // just zero-extending, but for SSE just duplicating the top 16-bits is
      // cheaper and has the same effect for out of range values.
      if (Subtarget.hasAVX()) {
        SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
        Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
        Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
        Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
        Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
      } else {
        SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
        SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
                                             {4, 5, 6, 7, -1, -1, -1, -1});
        Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
                                    {0, 1, 1, 1, -1, -1, -1, -1});
        Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
                                    {2, 3, 3, 3, -1, -1, -1, -1});
        Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
                                    {0, 1, 1, 1, -1, -1, -1, -1});
        Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
                                    {2, 3, 3, 3, -1, -1, -1, -1});
      }
    }

    SDValue R0 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt0));
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt1));
    SDValue R2 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt2));
    SDValue R3 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt3));

    // Merge the shifted lane results optimally with/without PBLENDW.
    // TODO - ideally shuffle combining would handle this.
    if (Subtarget.hasSSE41()) {
      SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
      SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
      return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
    }
    SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
    SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
  }

  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
  // make the existing SSE solution better.
  // NOTE: We honor preferred vector width before promoting to 512-bits.
  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
      (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
      (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
      (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
      (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
    assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
           "Unexpected vector type");
    MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
    Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
  }

  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
    unsigned ShiftOpcode = Op->getOpcode();

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (VT.is512BitVector()) {
        // On AVX512BW targets we make use of the fact that VSELECT lowers
        // to a masked blend which selects bytes based just on the sign bit
        // extracted to a mask.
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
                           ISD::SETGT);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      } else if (Subtarget.hasSSE41()) {
        // On SSE41 targets we make use of the fact that VSELECT lowers
        // to PBLENDVB which selects bytes based just on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getSelect(dl, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);
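
    // After the shift by 5, bit 2 of each amount sits in its byte's sign
    // bit, which is all PBLENDVB (or the PCMPGT-with-zero test) inspects;
    // each 'a += a' below then moves the next lower amount bit into the
    // sign position for the shift-by-2 and shift-by-1 stages.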
    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M =
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }

    if (Op->getOpcode() == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of an i16
      // vector so we can correctly sign extend. We don't care what happens
      // to the lower byte.
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);

      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                                DAG.getConstant(4, dl, ExtVT));
      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                                DAG.getConstant(4, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(2, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(2, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(1, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(1, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero
      // upper byte, meaning that we can safely pack with PACKUSWB.
      RLo =
          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
      RHi =
          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }

  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
    MVT ExtVT = MVT::v8i32;
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }
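
  // The unpacking above places each i16 of R in the upper half of an i32
  // lane (with zeros below) and each amount in the lower half, so the i32
  // shift cannot spill bits into a neighbouring element; the SRL by 16
  // moves each result back down, leaving a zero upper half so the PACKUS
  // saturation is a no-op.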

  if (VT == MVT::v8i16) {
    unsigned ShiftOpcode = Op->getOpcode();

    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
      return DAG.getSelect(dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
    } else {
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
    }

    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
    R = SignBitSelect(Amt, M, R);
    return R;
  }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  return SDValue();
}

static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.isVector() && "Custom lowering only for vector rotates!");

  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();

  if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
    // Attempt to rotate by immediate.
    APInt UndefElts;
    SmallVector<APInt, 16> EltBits;
    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
      if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
            return EltBits[0] == V;
          })) {
        unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
        return DAG.getNode(Op, DL, VT, R,
                           DAG.getConstant(RotateAmt, DL, MVT::i8));
      }
    }

    // Else, fall back on VPROLV/VPRORV.
    return Op;
  }

  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
  if (Subtarget.hasXOP()) {
    // Split 256-bit integers.
    if (VT.is256BitVector())
      return Lower256IntArith(Op, DAG);
    assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

    // Attempt to rotate by immediate.
    if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
      if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
        uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
        assert(RotateAmt < EltSizeInBits && "Rotation out of range");
        return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
                           DAG.getConstant(RotateAmt, DL, MVT::i8));
      }
    }

    // Use general rotate by variable (per-element).
    return Op;
  }

  // Split 256-bit integers on pre-AVX2 targets.
  if (VT.is256BitVector() && !Subtarget.hasAVX2())
    return Lower256IntArith(Op, DAG);

  assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
          ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
           Subtarget.hasAVX2())) &&
         "Only vXi32/vXi16/vXi8 vector rotates supported");

  // Rotate by a uniform constant - expand back to shifts.
  // TODO - legalizers should be able to handle this.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
      if (RotateAmt == 0)
        return R;

      SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT);
      SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
      SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
      return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
    }
  }

  // Rotate by splat - expand back to shifts.
  // TODO - legalizers should be able to handle this.
  if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) &&
      IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
    SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
    AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
    SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
    SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
    return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
  }

  // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
  // the amount bit.
  if (EltSizeInBits == 8) {
    if (Subtarget.hasBWI()) {
      SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
      AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
      SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
      SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
      return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
    }

    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (Subtarget.hasSSE41()) {
        // On SSE41 targets we make use of the fact that VSELECT lowers
        // to PBLENDVB which selects bytes based just on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
      return DAG.getSelect(DL, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);

    // r = VSELECT(r, rot(r, 4), a);
    SDValue M;
    M = DAG.getNode(
        ISD::OR, DL, VT,
        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
    R = SignBitSelect(VT, Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

    // r = VSELECT(r, rot(r, 2), a);
    M = DAG.getNode(
        ISD::OR, DL, VT,
        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
    R = SignBitSelect(VT, Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

    // return VSELECT(r, rot(r, 1), a);
    M = DAG.getNode(
        ISD::OR, DL, VT,
        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
    return SignBitSelect(VT, Amt, M, R);
  }

  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
  bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
                        SupportedVectorVarShift(VT, Subtarget, ISD::SRL);

  // Best to fall back for all supported variable shifts.
  // AVX2 - best to fall back for non-constants as well.
  // TODO - legalizers should be able to handle this.
  if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
    SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
    AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
    SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
    SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
    return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
  }

  // As with shifts, convert the rotation amount to a multiplication factor.
  SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
  assert(Scale && "Failed to convert ROTL amount to scale");

  // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
  if (EltSizeInBits == 16) {
    SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
    SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
    return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
  }
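
  // A 16-bit example: rotl(0x8001, 1) gives MUL = 0x0002 (the bit shifted
  // out at the top is dropped) and MULHU = 0x0001 (that same wrapped bit),
  // and OR-ing them yields the rotated value 0x0003.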

  // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
  // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
  // that can then be OR'd with the lower 32-bits.
  assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
  static const int OddMask[] = {1, -1, 3, -1};
  SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
  SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);

  SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
                              DAG.getBitcast(MVT::v2i64, R),
                              DAG.getBitcast(MVT::v2i64, Scale));
  SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
                              DAG.getBitcast(MVT::v2i64, R13),
                              DAG.getBitcast(MVT::v2i64, Scale13));
  Res02 = DAG.getBitcast(VT, Res02);
  Res13 = DAG.getBitcast(VT, Res13);

  return DAG.getNode(ISD::OR, DL, VT,
                     DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
                     DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}

static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag.
  // The "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
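  // For example, (i32, i1) = uaddo(a, b) becomes an X86ISD::ADD with result
  // types (i32, i32) - the second result being EFLAGS - followed by a SETCC
  // of COND_B (carry) on the flags, with the pair returned via MERGE_VALUES.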
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  X86::CondCode Cond;
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();

  return false;
}

bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  auto PTy = cast<PointerType>(LI->getPointerOperandType());
  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
                                               : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
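  // E.g. a 64-bit atomicrmw add in 32-bit mode (or a 128-bit one in 64-bit
  // mode) expands to a cmpxchg8b/16b loop when the instruction exists, and
  // is otherwise left alone to become a __sync_fetch_and_* library call.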
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
                            : AtomicExpansionKind::None;
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86. We
    // must use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}

LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
      AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SSID);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}

static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceSSID == SyncScope::System) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
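
    // Without MFENCE (pre-SSE2), use a locked no-op RMW on the stack as the
    // barrier: any LOCK-prefixed instruction fully orders memory, and OR-ing
    // zero into [ESP] changes nothing while touching a cache line that is
    // almost certainly already owned by this core.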
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32),     // Base
      DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),            // Index
      DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),            // Segment.
      Zero,
      Chain
    };
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}

static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}

// Create MOVMSKB, taking into account whether we need to split for AVX1.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  MVT InVT = V.getSimpleValueType();

  if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
    Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
    Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
    Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
                     DAG.getConstant(16, DL, MVT::i8));
    return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
  }

  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}

static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
  // half to v32i1 and concatenating the result.
  if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
    assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
    assert(Subtarget.hasBWI() && "Expected BWI target");
    SDLoc dl(Op);
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                             DAG.getIntPtrConstant(0, dl));
    Lo = DAG.getBitcast(MVT::v32i1, Lo);
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                             DAG.getIntPtrConstant(1, dl));
    Hi = DAG.getBitcast(MVT::v32i1, Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
  }

  // Custom splitting for BWI types when AVX512F is available but BWI isn't.
  if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
    SDLoc dl(Op);
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
    EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
                                  DstVT.getVectorNumElements() / 2);
    Lo = DAG.getBitcast(CastVT, Lo);
    Hi = DAG.getBitcast(CastVT, Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
  }

  // Use MOVMSK for vector to scalar conversion to prevent scalarization.
  if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
    MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
    SDLoc DL(Op);
    SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
    V = getPMOVMSKB(DL, V, DAG, Subtarget);
    return DAG.getZExtOrTrunc(V, DL, DstVT);
  }

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
      SrcVT == MVT::i64) {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
      NumElts = SrcVT.getVectorNumElements();
      SVT = SrcVT.getVectorElementType();

      // Widen the input vector in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
             "Unexpected source type in LowerBITCAST");
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                                 DAG.getIntPtrConstant(0, dl)));
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2;
      SVT = MVT::i32;
    }

    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits() == 64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT == MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT == MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
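  // (Illustrative: if an i64 lane holds the per-byte counts
  // [1, 1, 0, 2, 0, 1, 1, 0], PSADBW against zero leaves their sum, 6, in
  // that lane, which is the lane's population count.)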
  if (EltVT == MVT::i64) {
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }

  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }

  // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Unknown how to handle type");

  // To obtain pop count for each i16 element starting from the pop count for
  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
  // right by 8. It is important to shift as i16s as i8 vector shift isn't
  // directly supported.
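  // (Illustrative: if an i16 lane holds byte counts a (high) and b (low),
  // then adding V << 8 to V as i8s leaves a+b in the high byte, and the
  // final i16 shift right by 8 moves a+b into the low byte.)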
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
                  DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}

static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned VecSize = VT.getSizeInBits();

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // The general idea is that each nibble of every byte in the input vector is
  // an index into an in-register pre-computed pop count table. We then split
  // the input vector into two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the lower
  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for the input byte.
  //
  // To obtain the pop count for elements != i8, we follow up with the same
  // approach and use additional tricks as described below.
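  //
  // (Illustrative: for the input byte 0xE5 = 0b11100101, the high nibble 0xE
  // indexes LUT[14] = 3 and the low nibble 0x5 indexes LUT[5] = 2, so the
  // summed pop count is 3 + 2 = 5.)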
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

  int NumByteElts = VecSize / 8;
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
  SDValue In = DAG.getBitcast(ByteVecVT, Op);
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumByteElts; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

  // Low nibbles
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

  // The input vector is used as the shuffle mask that indexes elements into
  // the LUT. After counting low and high nibbles, add the vector to obtain the
  // final pop count per i8 element.
  SDValue HighPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
  SDValue LowPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}

static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitmath lowering supported.");

  int VecSize = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  int Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for all integer vector types. We only use
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
  // much faster, even faster than using native popcnt instructions.
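  //
  // (Illustrative, on a single 4-bit group: for v = 0b1110, step one gives
  // v - ((v >> 1) & 0b0101) = 0b1001, i.e. per-2-bit counts 10 and 01, and
  // step two, applied to that result, gives 0b0001 + 0b0010 = 3, which is
  // popcount(0b1110).)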

  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
    MVT VT = V.getSimpleValueType();
    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
  };
  auto GetMask = [&](SDValue V, APInt Mask) {
    MVT VT = V.getSimpleValueType();
    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
  };

  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
  // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyways
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
  V = DAG.getNode(ISD::SUB, DL, VT, V, And);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));

  // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // counts.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}

// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDLoc DL(Op.getNode());
  SDValue Op0 = Op.getOperand(0);

  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
  if (Subtarget.hasVPOPCNTDQ()) {
    unsigned NumElems = VT.getVectorNumElements();
    assert((VT.getVectorElementType() == MVT::i8 ||
            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
    if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
    }
  }

  if (!Subtarget.hasSSSE3()) {
    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}

static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}

static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return Lower256IntUnary(Op, DAG);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
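  //
  // (Illustrative: for v4i32, element 0 of the mask reads source bytes
  // 19, 18, 17, 16 - the second operand's first dword in reverse byte order -
  // each tagged with the bit-reverse permute op.)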
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}

static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasXOP() && !VT.is512BitVector())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
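  //
  // (Illustrative: for the input byte 0x1F, the low nibble 0xF maps to
  // LoLUT[15] = 0xF0 and the high nibble 0x1 maps to HiLUT[1] = 0x08;
  // 0xF0 | 0x08 = 0xF8, the bit-reversal of 0x1F.)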
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}

static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool AllowIncDec = true) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    // Convert to inc/dec if they aren't slow or we are optimizing for size.
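    // (Illustrative: for an add of +1 this selects "lock inc" rather than
    // "lock add $1", saving the immediate byte; slowIncDec is set on targets
    // where INC/DEC's partial EFLAGS update is believed to make them slower.)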
    if (AllowIncDec && (!Subtarget.slowIncDec() ||
                        DAG.getMachineFunction().getFunction().optForSize())) {
      if ((NewOpc == X86ISD::LADD && C->isOne()) ||
          (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
        return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
                                       DAG.getVTList(MVT::i32, MVT::Other),
                                       {N->getOperand(0), N->getOperand(1)},
                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
      if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
          (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
        return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
                                       DAG.getVTList(MVT::i32, MVT::Other),
                                       {N->getOperand(0), N->getOperand(1)},
                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
    }
  }

  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}

/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused. They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
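  // (Illustrative: "old = atomic_fetch_add(p, v)" keeps its result and can
  // map onto XADD, while e.g. atomic_fetch_or with a used result has no
  // single x86 instruction and must arrive here already expanded.)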
  if (N->hasAnyUseOfValue(0)) {
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    if (Opc == ISD::ATOMIC_LOAD_SUB) {
      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                           RHS, AN->getMemOperand());
    }
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
  // RAUW the chain, but don't worry about the result, as it's unused.
  assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}

static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  // (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
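  // (XCHG with a memory operand is implicitly LOCK-prefixed, so a seq_cst
  // store becomes one instruction instead of the MOV + MFENCE sequence that
  // would otherwise be required for sequential consistency.)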
  if (cast<AtomicSDNode>(Node)->getOrdering() ==
          AtomicOrdering::SequentiallyConsistent ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
                                 Node->getOperand(0),
                                 Node->getOperand(1), Node->getOperand(2),
                                 cast<AtomicSDNode>(Node)->getMemOperand());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}

static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(N);

  // Set the carry flag.
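  // (Adding all-ones to the incoming carry value overflows exactly when that
  // value is nonzero, so EFLAGS.CF afterwards mirrors the boolean carry-in
  // and can feed the ADC/SBB node built below.)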
  SDValue Carry = Op.getOperand(2);
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));

  unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
  SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
                            Op.getOperand(1), Carry.getValue(1));

  SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
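  // (For f32 the call is modeled as returning <4 x float> in XMM0 with sin(x)
  // in lane 0 and cos(x) in lane 1; the two extracts at the end of this
  // function pull those lanes apart.)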
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = TLI.getLibcallName(LC);
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
                      : (Type *)VectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:64 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}

/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
         "input and widen element type must match");

  unsigned InNumElts = InVT.getVectorNumElements();
  unsigned WidenNumElts = NVT.getVectorNumElements();
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
         "Unexpected request for vector widening");

  SDLoc dl(InOp);
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
      InOp.getNumOperands() == 2) {
    SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }
  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned i = 0; i < InNumElts; ++i)
      Ops.push_back(InOp.getOperand(i));

    EVT EltVT = InOp.getOperand(0).getValueType();

    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
        DAG.getUNDEF(EltVT);
    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
      Ops.push_back(FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
      DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
                     InOp, DAG.getIntPtrConstant(0, dl));
}

static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue Scale = N->getScale();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();

  if (VT == MVT::v2f32) {
    assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
    // If the index is v2i64 and we have VLX we can use xmm for data and index.
    if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
      Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                        DAG.getUNDEF(MVT::v2f32));
      SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
      SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
      SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
          VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
      DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
      return SDValue(NewScatter.getNode(), 1);
    }
    return SDValue();
  }

  if (VT == MVT::v2i32) {
    assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                      DAG.getUNDEF(MVT::v2i32));
    // If the index is v2i64 and we have VLX we can use xmm for data and index.
    if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
      SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
      SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
      SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
          VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
      DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
      return SDValue(NewScatter.getNode(), 1);
    }
    // Custom widen all the operands to avoid promotion.
    EVT NewIndexVT = EVT::getVectorVT(
        *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
    Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
                        DAG.getUNDEF(Index.getValueType()));
    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
                       DAG.getConstant(0, dl, MVT::v2i1));
    SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
                                Ops, N->getMemOperand());
  }

  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  // If the index is v2i32, we're being called by type legalization and we
  // should just let the default handling take care of it.
  if (IndexVT == MVT::v2i32)
    return SDValue();

  // If we don't have VLX and neither the passthru nor the index is 512 bits,
  // we need to widen until one is.
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // Determine how much we need to widen by to get a 512-bit type.
    unsigned Factor = std::min(512/VT.getSizeInBits(),
                               512/IndexVT.getSizeInBits());
    unsigned NumElts = VT.getVectorNumElements() * Factor;

    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);

    Src = ExtendToType(Src, VT, DAG);
    Index = ExtendToType(Index, IndexVT, DAG);
    Mask = ExtendToType(Mask, MaskVT, DAG, true);
  }

  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
  SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
      VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}

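/// Lower a masked load by widening the data and mask to 512 bits on AVX-512
/// targets without VLX, performing the wide load, and extracting the
/// original-width result.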
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {

  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  SDValue Src0 = N->getSrc0();
  Src0 = ExtendToType(Src0, WideDataVT, DAG);

  // Mask element has to be i1.
  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
         "Unexpected mask type");

  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, Src0,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType(),
                                      N->isExpandingLoad());

  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // Mask element has to be i1.
  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
         "Unexpected mask type");

  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}

static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Src0 = N->getValue();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  // If the index is v2i32, we're being called by type legalization.
  if (IndexVT == MVT::v2i32)
    return SDValue();

  // If we don't have VLX and neither the passthru nor the index is 512 bits,
  // we need to widen until one is.
  MVT OrigVT = VT;
  if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
      !IndexVT.is512BitVector()) {
    // Determine how much we need to widen by to get a 512-bit type.
    unsigned Factor = std::min(512/VT.getSizeInBits(),
                               512/IndexVT.getSizeInBits());

    unsigned NumElts = VT.getVectorNumElements() * Factor;

    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);

    Src0 = ExtendToType(Src0, VT, DAG);
    Index = ExtendToType(Index, IndexVT, DAG);
    Mask = ExtendToType(Mask, MaskVT, DAG, true);
  }

  SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
                    N->getScale() };
  SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
      DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
      N->getMemOperand());
  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
                                NewGather, DAG.getIntPtrConstant(0, dl));
  return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}

SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
  case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
  case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
  case ISD::UMUL_LOHI:
  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
  case ISD::ROTL:
  case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:                return LowerADD_SUB(Op, DAG);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
  case ISD::ABS:                return LowerABS(Op, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
  case ISD::GC_TRANSITION_START:
    return LowerGC_TRANSITION_START(Op, DAG);
  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
  }
}

/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  SDValue Res = LowerOperation(SDValue(N, 0), DAG);

  if (!Res.getNode())
    return;

  assert((N->getNumValues() <= Res->getNumValues()) &&
         "Lowering returned the wrong number of results!");

  // Place new result values based on the result number of N.
  // In some cases (LowerSINT_TO_FP for example) Res has more result values
  // than the original node, in which case the chain (the last value) should
  // be dropped.
  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
    Results.push_back(Res.getValue(I));
}

/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case X86ISD::AVG: {
    // Legalize types for X86ISD::AVG by expanding vectors.
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

    auto InVT = N->getValueType(0);
    assert(InVT.getSizeInBits() < 128);
    assert(128 % InVT.getSizeInBits() == 0);
    unsigned NumConcat = 128 / InVT.getSizeInBits();

    EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
                                 InVT.getVectorElementType(),
                                 NumConcat * InVT.getVectorNumElements());

    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
    Ops[0] = N->getOperand(0);
    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
    Ops[0] = N->getOperand(1);
    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
    if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
                        DAG.getIntPtrConstant(0, dl));
    Results.push_back(Res);
    return;
  }
  case ISD::SETCC: {
    // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
    // setCC result type is v2i1 because type legalization will end up with
    // a v4i1 setcc plus an extend.
    assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
    if (N->getOperand(0).getValueType() != MVT::v2f32)
      return;
    SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(1), UNDEF);
    SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
                              N->getOperand(2));
    if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
                        DAG.getIntPtrConstant(0, dl));
    Results.push_back(Res);
    return;
  }
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(1), UNDEF);
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
    return;
  }
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue V = LowerWin64_i128OP(SDValue(N, 0), DAG);
    Results.push_back(V);
    return;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
    EVT VT = N->getValueType(0);
    SDValue Src = N->getOperand(0);
    EVT SrcVT = Src.getValueType();

    if (VT == MVT::v2i32) {
      assert((IsSigned || Subtarget.hasAVX512()) &&
             "Can only handle signed conversion without AVX512");
      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
      if (Src.getValueType() == MVT::v2f64) {
        MVT ResVT = MVT::v4i32;
        unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
        if (!IsSigned && !Subtarget.hasVLX()) {
          // Widen to 512-bits.
          ResVT = MVT::v8i32;
          Opc = ISD::FP_TO_UINT;
          Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
                            DAG.getUNDEF(MVT::v8f64),
                            Src, DAG.getIntPtrConstant(0, dl));
        }
        SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
        bool WidenType = getTypeAction(*DAG.getContext(),
                                       MVT::v2i32) == TypeWidenVector;
        ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
                          DAG.getIntPtrConstant(0, dl));
        Results.push_back(Res);
        return;
      }
      if (SrcVT == MVT::v2f32) {
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                  DAG.getUNDEF(MVT::v2f32));
        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
                                   : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
        if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
          Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);
        return;
      }

      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
      // so early out here.
      return;
    }

    if (Subtarget.hasDQI() && VT == MVT::i64 &&
        (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
      assert(!Subtarget.is64Bit() && "i64 should be legal");
      unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
      // Using a 256-bit input here to guarantee 128-bit input for f32 case.
      // TODO: Use 128-bit vectors for f64 case?
      // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
      MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
      MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);

      SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
      SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
                                DAG.getConstantFP(0.0, dl, VecInVT), Src,
                                ZeroIdx);
      Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
      Results.push_back(Res);
      return;
    }

    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode()) {
      // Return a load from the stack slot.
      if (StackSlot.getNode())
        Results.push_back(
            DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
      else
        Results.push_back(FIST);
    }
    return;
  }
  case ISD::SINT_TO_FP: {
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = N->getOperand(0);
    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
      return;
    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
    return;
  }
  case ISD::UINT_TO_FP: {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    EVT VT = N->getValueType(0);
    if (VT != MVT::v2f32)
      return;
    SDValue Src = N->getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
      return;
    }
    if (SrcVT != MVT::v2i32)
      return;
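    // The bias constant below is the bit pattern of the double 2^52. OR-ing a
    // zero-extended 32-bit integer into the low mantissa bits of 2^52 yields
    // the exact double 2^52 + x, so the FSUB of 2^52 recovers x as a double
    // before the final round to v2f32.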
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
    SDValue VBias = DAG.getConstantFP(
        BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getBitcast(MVT::v2i64, VBias));
    Or = DAG.getBitcast(MVT::v2f64, Or);
    // TODO: Are there any fast-math-flags to propagate here?
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
    return;
  }
  case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
      return;
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
    return;
  }
  case ISD::FP_EXTEND: {
    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
    // No other ValueType for FP_EXTEND should reach this point.
    assert(N->getValueType(0) == MVT::v2f32 &&
           "Do not know how to legalize this Node");
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default : llvm_unreachable("Do not know how to custom type "
                               "legalize this intrinsic operation!");
    case Intrinsic::x86_rdtsc:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdtscp:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdpmc:
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);

    case Intrinsic::x86_xgetbv:
      return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
    }
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
      Results.push_back(V);
    return;
  }
  case ISD::READCYCLECOUNTER: {
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                   Results);
  }
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, dl, HalfT));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(1, dl, HalfT));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
                             Regs64bit ? X86::RAX : X86::EAX,
                             cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
                             Regs64bit ? X86::RDX : X86::EDX,
                             cpInH, cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(0, dl, HalfT));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(1, dl, HalfT));
    swapInH =
        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
                         swapInH, cpInH.getValue(1));
    // If the current function needs the base pointer, RBX,
    // we shouldn't use cmpxchg directly.
    // Indeed the lowering of that instruction will clobber
    // that register and since RBX will be a reserved register
    // the register allocator will not make sure its value will
    // be properly saved and restored around this live-range.
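    // (CMPXCHG8B/CMPXCHG16B implicitly compare against EDX:EAX / RDX:RAX and
    // write the replacement value from ECX:EBX / RCX:RBX, which is why the
    // halves were copied into those registers above.)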
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    SDValue Result;
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    unsigned BasePtr = TRI->getBaseRegister();
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
      // ISel prefers the LCMPXCHG64 variant.
      // If that assert breaks, that means it is not the case anymore,
      // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
      // not just EBX. This is a matter of accepting i64 input for that
      // pseudo, and restoring into the register of the right width
      // in the expand pseudo. Everything else should just work.
      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
             "Saving only half of the RBX");
      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
                                           Regs64bit ? X86::RBX : X86::EBX,
                                           HalfT, swapInH.getValue(1));
      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
                       RBXSave,
                       /*Glue*/ RBXSave.getValue(2)};
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    } else {
      unsigned Opcode =
          Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
                                 swapInH.getValue(1));
      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
                       swapInL.getValue(1)};
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    }
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                        Regs64bit ? X86::RAX : X86::EAX,
                                        HalfT, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
                                        Regs64bit ? X86::RDX : X86::EDX,
                                        HalfT, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
                                        MVT::i32, cpOutH.getValue(2));
    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
    Results.push_back(Success);
    Results.push_back(EFLAGS.getValue(1));
    return;
  }
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD: {
    // Delegate to generic TypeLegalization. Situations we can really handle
    // should have already been dealt with by AtomicExpandPass.cpp.
    break;
  }
  case ISD::BITCAST: {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    EVT DstVT = N->getValueType(0);
    EVT SrcVT = N->getOperand(0).getValueType();

    // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
    // we can split using the k-register rather than memory.
    if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
      assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
      Lo = DAG.getBitcast(MVT::i32, Lo);
      Hi = DAG.getBitcast(MVT::i32, Hi);
      SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
      Results.push_back(Res);
      return;
    }

    // Custom splitting for BWI types when AVX512F is available but BWI isn't.
    if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
        SrcVT.isVector() && isTypeLegal(SrcVT)) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
      MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
      Lo = DAG.getBitcast(CastVT, Lo);
      Hi = DAG.getBitcast(CastVT, Hi);
      SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
      Results.push_back(Res);
      return;
    }

    if (SrcVT != MVT::f64 ||
        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
      return;

    unsigned NumElts = DstVT.getVectorNumElements();
    EVT SVT = DstVT.getVectorElementType();
    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   MVT::v2f64, N->getOperand(0));
    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

    if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
      // If we are legalizing vectors by widening, we already have the desired
      // legal vector type, just return it.
      Results.push_back(ToVecInt);
      return;
    }

    SmallVector<SDValue, 8> Elts;
    for (unsigned i = 0, e = NumElts; i != e; ++i)
      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
                                 ToVecInt, DAG.getIntPtrConstant(i, dl)));

    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
    return;
  }
25789 case ISD::MGATHER: {
25790 EVT VT = N->getValueType(0);
25791 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25792 auto *Gather = cast<MaskedGatherSDNode>(N);
25793 SDValue Index = Gather->getIndex();
25794 if (Index.getValueType() != MVT::v2i64)
25796 SDValue Mask = Gather->getMask();
25797 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25798 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25799 Gather->getValue(),
25800 DAG.getUNDEF(MVT::v2f32));
25801 if (!Subtarget.hasVLX()) {
25802 // We need to widen the mask, but the instruction will only use 2
25803 // of its elements. So we can use undef.
25804 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25805 DAG.getUNDEF(MVT::v2i1));
25806 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25807 }
25808 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25809 Index, Gather->getScale() };
25810 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25811 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25812 Gather->getMemoryVT(), Gather->getMemOperand());
25813 Results.push_back(Res);
25814 Results.push_back(Res.getValue(2));
25815 return;
25816 }
25817 if (VT == MVT::v2i32) {
25818 auto *Gather = cast<MaskedGatherSDNode>(N);
25819 SDValue Index = Gather->getIndex();
25820 SDValue Mask = Gather->getMask();
25821 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25822 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25823 Gather->getValue(),
25824 DAG.getUNDEF(MVT::v2i32));
25825 // If the index is v2i64 we can use it directly.
25826 if (Index.getValueType() == MVT::v2i64 &&
25827 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25828 if (!Subtarget.hasVLX()) {
25829 // We need to widen the mask, but the instruction will only use 2
25830 // of its elements. So we can use undef.
25831 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25832 DAG.getUNDEF(MVT::v2i1));
25833 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25834 }
25835 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25836 Index, Gather->getScale() };
25837 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25838 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25839 Gather->getMemoryVT(), Gather->getMemOperand());
25840 SDValue Chain = Res.getValue(2);
25841 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
25842 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25843 DAG.getIntPtrConstant(0, dl));
25844 Results.push_back(Res);
25845 Results.push_back(Chain);
25846 return;
25847 }
25848 EVT IndexVT = Index.getValueType();
25849 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25850 IndexVT.getScalarType(), 4);
25851 // Otherwise we need to custom widen everything to avoid promotion.
25852 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25853 DAG.getUNDEF(IndexVT));
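// Note that the extra mask lanes are zeroed rather than left undef here,
// so the generic masked gather below sees them as provably inactive.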
25854 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25855 DAG.getConstant(0, dl, MVT::v2i1));
25856 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25857 Index, Gather->getScale() };
25858 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25859 Gather->getMemoryVT(), dl, Ops,
25860 Gather->getMemOperand());
25861 SDValue Chain = Res.getValue(1);
25862 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25863 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25864 DAG.getIntPtrConstant(0, dl));
25865 Results.push_back(Res);
25866 Results.push_back(Chain);
25867 return;
25868 }
25869 break;
25870 }
25871 }
25872 }
25874 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25875 switch ((X86ISD::NodeType)Opcode) {
25876 case X86ISD::FIRST_NUMBER: break;
25877 case X86ISD::BSF: return "X86ISD::BSF";
25878 case X86ISD::BSR: return "X86ISD::BSR";
25879 case X86ISD::SHLD: return "X86ISD::SHLD";
25880 case X86ISD::SHRD: return "X86ISD::SHRD";
25881 case X86ISD::FAND: return "X86ISD::FAND";
25882 case X86ISD::FANDN: return "X86ISD::FANDN";
25883 case X86ISD::FOR: return "X86ISD::FOR";
25884 case X86ISD::FXOR: return "X86ISD::FXOR";
25885 case X86ISD::FILD: return "X86ISD::FILD";
25886 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25887 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25888 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25889 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25890 case X86ISD::FLD: return "X86ISD::FLD";
25891 case X86ISD::FST: return "X86ISD::FST";
25892 case X86ISD::CALL: return "X86ISD::CALL";
25893 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25894 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25895 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25896 case X86ISD::BT: return "X86ISD::BT";
25897 case X86ISD::CMP: return "X86ISD::CMP";
25898 case X86ISD::COMI: return "X86ISD::COMI";
25899 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25900 case X86ISD::CMPM: return "X86ISD::CMPM";
25901 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25902 case X86ISD::SETCC: return "X86ISD::SETCC";
25903 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25904 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25905 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25906 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25907 case X86ISD::CMOV: return "X86ISD::CMOV";
25908 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25909 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25910 case X86ISD::IRET: return "X86ISD::IRET";
25911 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25912 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25913 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25914 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25915 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25916 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25917 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25918 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25919 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25920 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25921 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25922 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25923 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25924 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25925 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25926 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25927 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25928 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25929 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25930 case X86ISD::HADD: return "X86ISD::HADD";
25931 case X86ISD::HSUB: return "X86ISD::HSUB";
25932 case X86ISD::FHADD: return "X86ISD::FHADD";
25933 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25934 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25935 case X86ISD::FMAX: return "X86ISD::FMAX";
25936 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25937 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25938 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
25939 case X86ISD::FMIN: return "X86ISD::FMIN";
25940 case X86ISD::FMINS: return "X86ISD::FMINS";
25941 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25942 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25943 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25944 case X86ISD::FMINC: return "X86ISD::FMINC";
25945 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25946 case X86ISD::FRCP: return "X86ISD::FRCP";
25947 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25948 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25949 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25950 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25951 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25952 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25953 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25954 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25955 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25956 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25957 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25958 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25959 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25960 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25961 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25962 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25963 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25964 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25965 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25966 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25967 case X86ISD::LADD: return "X86ISD::LADD";
25968 case X86ISD::LSUB: return "X86ISD::LSUB";
25969 case X86ISD::LOR: return "X86ISD::LOR";
25970 case X86ISD::LXOR: return "X86ISD::LXOR";
25971 case X86ISD::LAND: return "X86ISD::LAND";
25972 case X86ISD::LINC: return "X86ISD::LINC";
25973 case X86ISD::LDEC: return "X86ISD::LDEC";
25974 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25975 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25976 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25977 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25978 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25979 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25980 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25981 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25982 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25983 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25984 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25985 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25986 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25987 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25988 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25989 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25990 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25991 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25992 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25993 case X86ISD::VSHL: return "X86ISD::VSHL";
25994 case X86ISD::VSRL: return "X86ISD::VSRL";
25995 case X86ISD::VSRA: return "X86ISD::VSRA";
25996 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25997 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25998 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25999 case X86ISD::VSRAV: return "X86ISD::VSRAV";
26000 case X86ISD::VROTLI: return "X86ISD::VROTLI";
26001 case X86ISD::VROTRI: return "X86ISD::VROTRI";
26002 case X86ISD::VPPERM: return "X86ISD::VPPERM";
26003 case X86ISD::CMPP: return "X86ISD::CMPP";
26004 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
26005 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
26006 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
26007 case X86ISD::ADD: return "X86ISD::ADD";
26008 case X86ISD::SUB: return "X86ISD::SUB";
26009 case X86ISD::ADC: return "X86ISD::ADC";
26010 case X86ISD::SBB: return "X86ISD::SBB";
26011 case X86ISD::SMUL: return "X86ISD::SMUL";
26012 case X86ISD::UMUL: return "X86ISD::UMUL";
26013 case X86ISD::SMUL8: return "X86ISD::SMUL8";
26014 case X86ISD::UMUL8: return "X86ISD::UMUL8";
26015 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
26016 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
26017 case X86ISD::INC: return "X86ISD::INC";
26018 case X86ISD::DEC: return "X86ISD::DEC";
26019 case X86ISD::OR: return "X86ISD::OR";
26020 case X86ISD::XOR: return "X86ISD::XOR";
26021 case X86ISD::AND: return "X86ISD::AND";
26022 case X86ISD::BEXTR: return "X86ISD::BEXTR";
26023 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
26024 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
26025 case X86ISD::PTEST: return "X86ISD::PTEST";
26026 case X86ISD::TESTP: return "X86ISD::TESTP";
26027 case X86ISD::KORTEST: return "X86ISD::KORTEST";
26028 case X86ISD::KTEST: return "X86ISD::KTEST";
26029 case X86ISD::KADD: return "X86ISD::KADD";
26030 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
26031 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
26032 case X86ISD::PACKSS: return "X86ISD::PACKSS";
26033 case X86ISD::PACKUS: return "X86ISD::PACKUS";
26034 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
26035 case X86ISD::VALIGN: return "X86ISD::VALIGN";
26036 case X86ISD::VSHLD: return "X86ISD::VSHLD";
26037 case X86ISD::VSHRD: return "X86ISD::VSHRD";
26038 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
26039 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
26040 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
26041 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
26042 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
26043 case X86ISD::SHUFP: return "X86ISD::SHUFP";
26044 case X86ISD::SHUF128: return "X86ISD::SHUF128";
26045 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
26046 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
26047 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
26048 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
26049 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
26050 case X86ISD::MOVSD: return "X86ISD::MOVSD";
26051 case X86ISD::MOVSS: return "X86ISD::MOVSS";
26052 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
26053 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
26054 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
26055 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
26056 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
26057 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
26058 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
26059 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
26060 case X86ISD::VPERMV: return "X86ISD::VPERMV";
26061 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
26062 case X86ISD::VPERMI: return "X86ISD::VPERMI";
26063 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
26064 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
26065 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
26066 case X86ISD::VRANGE: return "X86ISD::VRANGE";
26067 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
26068 case X86ISD::VRANGES: return "X86ISD::VRANGES";
26069 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
26070 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
26071 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
26072 case X86ISD::PSADBW: return "X86ISD::PSADBW";
26073 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
26074 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
26075 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
26076 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
26077 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
26078 case X86ISD::MFENCE: return "X86ISD::MFENCE";
26079 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
26080 case X86ISD::SAHF: return "X86ISD::SAHF";
26081 case X86ISD::RDRAND: return "X86ISD::RDRAND";
26082 case X86ISD::RDSEED: return "X86ISD::RDSEED";
26083 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
26084 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
26085 case X86ISD::VPSHA: return "X86ISD::VPSHA";
26086 case X86ISD::VPSHL: return "X86ISD::VPSHL";
26087 case X86ISD::VPCOM: return "X86ISD::VPCOM";
26088 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
26089 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
26090 case X86ISD::FMSUB: return "X86ISD::FMSUB";
26091 case X86ISD::FNMADD: return "X86ISD::FNMADD";
26092 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
26093 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
26094 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
26095 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
26096 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
26097 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
26098 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
26099 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
26100 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
26101 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
26102 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
26103 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
26104 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
26105 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
26106 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
26107 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
26108 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
26109 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
26110 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
26111 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
26112 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
26113 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
26114 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
26115 case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
26116 case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
26117 case X86ISD::XTEST: return "X86ISD::XTEST";
26118 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
26119 case X86ISD::EXPAND: return "X86ISD::EXPAND";
26120 case X86ISD::SELECT: return "X86ISD::SELECT";
26121 case X86ISD::SELECTS: return "X86ISD::SELECTS";
26122 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
26123 case X86ISD::RCP14: return "X86ISD::RCP14";
26124 case X86ISD::RCP14S: return "X86ISD::RCP14S";
26125 case X86ISD::RCP28: return "X86ISD::RCP28";
26126 case X86ISD::RCP28S: return "X86ISD::RCP28S";
26127 case X86ISD::EXP2: return "X86ISD::EXP2";
26128 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
26129 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
26130 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
26131 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
26132 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
26133 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
26134 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
26135 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
26136 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
26137 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
26138 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
26139 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
26140 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
26141 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
26142 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
26143 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
26144 case X86ISD::SCALEF: return "X86ISD::SCALEF";
26145 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
26146 case X86ISD::ADDS: return "X86ISD::ADDS";
26147 case X86ISD::SUBS: return "X86ISD::SUBS";
26148 case X86ISD::AVG: return "X86ISD::AVG";
26149 case X86ISD::MULHRS: return "X86ISD::MULHRS";
26150 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
26151 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
26152 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
26153 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
26154 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
26155 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
26156 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
26157 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
26158 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
26159 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
26160 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
26161 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
26162 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
26163 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
26164 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
26165 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
26166 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
26167 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
26168 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
26169 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
26170 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
26171 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
26172 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
26173 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
26174 case X86ISD::LWPINS: return "X86ISD::LWPINS";
26175 case X86ISD::MGATHER: return "X86ISD::MGATHER";
26176 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
26177 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
26178 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
26179 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
26180 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
26181 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
26182 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
26183 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
26184 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
26185 case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
26186 case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
26187 case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
26188 case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
26189 }
26190 return nullptr;
26191 }
26193 /// Return true if the addressing mode represented by AM is legal for this
26194 /// target, for a load/store of the specified type.
26195 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
26196 const AddrMode &AM, Type *Ty,
26197 unsigned AS,
26198 Instruction *I) const {
26199 // X86 supports extremely general addressing modes.
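// The general form accepted here is [BaseReg + Scale*IndexReg + Disp32];
// e.g. 'movl 16(%rdi,%rcx,4), %eax' loads from %rdi + 4*%rcx + 16.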
26200 CodeModel::Model M = getTargetMachine().getCodeModel();
26202 // X86 allows a sign-extended 32-bit immediate field as a displacement.
26203 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
26204 return false;
26206 if (AM.BaseGV) {
26207 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
26209 // If a reference to this global requires an extra load, we can't fold it.
26210 if (isGlobalStubReference(GVFlags))
26211 return false;
26213 // If BaseGV requires a register for the PIC base, we cannot also have a
26214 // BaseReg specified.
26215 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
26216 return false;
26218 // If lower 4G is not available, then we must use rip-relative addressing.
26219 if ((M != CodeModel::Small || isPositionIndependent()) &&
26220 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
26221 return false;
26222 }
26224 switch (AM.Scale) {
26225 case 0:
26226 case 1:
26227 case 2:
26228 case 4:
26229 case 8:
26230 // These scales always work.
26231 break;
26232 case 3:
26233 case 5:
26234 case 9:
26235 // These scales are formed with basereg+scalereg. Only accept if there is
26236 // no basereg yet.
26237 if (AM.HasBaseReg)
26238 return false;
26239 break;
26240 default: // Other stuff never works.
26241 return false;
26242 }
26244 return true;
26245 }
26247 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
26248 unsigned Bits = Ty->getScalarSizeInBits();
26250 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
26251 // particularly cheaper than those without.
26252 if (Bits == 8)
26253 return false;
26255 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
26256 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
26257 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
26258 return false;
26260 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
26261 // shifts just as cheap as scalar ones.
26262 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
26263 return false;
26265 // AVX512BW has shifts such as vpsllvw.
26266 if (Subtarget.hasBWI() && Bits == 16)
26267 return false;
26269 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
26270 // fully general vector.
26271 return true;
26272 }
26274 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
26275 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26276 return false;
26277 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
26278 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
26279 return NumBits1 > NumBits2;
26282 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
26283 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26284 return false;
26286 if (!isTypeLegal(EVT::getEVT(Ty1)))
26287 return false;
26289 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
26291 // Assuming the caller doesn't have a zeroext or signext return parameter,
26292 // truncation all the way down to i1 is valid.
26293 return true;
26294 }
26296 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
26297 return isInt<32>(Imm);
26300 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
26301 // Can also use sub to handle negated immediates.
26302 return isInt<32>(Imm);
26305 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
26306 if (!VT1.isInteger() || !VT2.isInteger())
26307 return false;
26308 unsigned NumBits1 = VT1.getSizeInBits();
26309 unsigned NumBits2 = VT2.getSizeInBits();
26310 return NumBits1 > NumBits2;
26313 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
26314 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
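// e.g. 'movl %ecx, %eax' already clears bits 63:32 of %rax, so the
// i32 -> i64 zero extension needs no extra instruction.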
26315 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
26318 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
26319 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26320 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
26323 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
26324 EVT VT1 = Val.getValueType();
26325 if (isZExtFree(VT1, VT2))
26326 return true;
26328 if (Val.getOpcode() != ISD::LOAD)
26329 return false;
26331 if (!VT1.isSimple() || !VT1.isInteger() ||
26332 !VT2.isSimple() || !VT2.isInteger())
26333 return false;
26335 switch (VT1.getSimpleVT().SimpleTy) {
26336 default: break;
26337 case MVT::i8:
26338 case MVT::i16:
26339 case MVT::i32:
26340 // X86 has 8, 16, and 32-bit zero-extending loads.
26341 return true;
26342 }
26344 return false;
26345 }
26347 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
26348 EVT SrcVT = ExtVal.getOperand(0).getValueType();
26350 // There is no extending load for vXi1.
26351 if (SrcVT.getScalarType() == MVT::i1)
26352 return false;
26354 return true;
26355 }
26358 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
26359 if (!Subtarget.hasAnyFMA())
26360 return false;
26362 VT = VT.getScalarType();
26364 if (!VT.isSimple())
26365 return false;
26367 switch (VT.getSimpleVT().SimpleTy) {
26368 case MVT::f32:
26369 case MVT::f64:
26370 return true;
26371 default:
26372 break;
26373 }
26375 return false;
26376 }
26378 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
26379 // i16 instructions are longer (0x66 prefix) and potentially slower.
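// (With a 16-bit immediate, 0x66 also becomes a length-changing prefix,
// which can stall the decoders on several Intel microarchitectures.)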
26380 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
26383 /// Targets can use this to indicate that they only support *some*
26384 /// VECTOR_SHUFFLE operations, those with specific masks.
26385 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
26386 /// are assumed to be legal.
26387 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
26388 if (!VT.isSimple())
26389 return false;
26391 // Not for i1 vectors
26392 if (VT.getSimpleVT().getScalarType() == MVT::i1)
26393 return false;
26395 // Very little shuffling can be done for 64-bit vectors right now.
26396 if (VT.getSimpleVT().getSizeInBits() == 64)
26397 return false;
26399 // We only care that the types being shuffled are legal. The lowering can
26400 // handle any possible shuffle mask that results.
26401 return isTypeLegal(VT.getSimpleVT());
26404 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
26405 EVT VT) const {
26406 // Don't convert an 'and' into a shuffle that we don't directly support.
26407 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
26408 if (!Subtarget.hasAVX2())
26409 if (VT == MVT::v32i8 || VT == MVT::v16i16)
26410 return false;
26412 // Just delegate to the generic legality, clear masks aren't special.
26413 return isShuffleMaskLegal(Mask, VT);
26416 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
26417 // If the subtarget is using retpolines, we must not generate jump tables.
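// (A jump table lowers to an indirect branch, which is exactly what the
// retpoline mitigation has to avoid.)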
26418 if (Subtarget.useRetpoline())
26419 return false;
26421 // Otherwise, fallback on the generic logic.
26422 return TargetLowering::areJTsAllowed(Fn);
26425 //===----------------------------------------------------------------------===//
26426 // X86 Scheduler Hooks
26427 //===----------------------------------------------------------------------===//
26429 /// Utility function to emit xbegin specifying the start of an RTM region.
26430 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
26431 const TargetInstrInfo *TII) {
26432 DebugLoc DL = MI.getDebugLoc();
26434 const BasicBlock *BB = MBB->getBasicBlock();
26435 MachineFunction::iterator I = ++MBB->getIterator();
26437 // For the v = xbegin(), we generate
26438 //
26439 // thisMBB:
26440 //  xbegin fallMBB
26441 //
26442 // mainMBB:
26443 //  s0 = -1
26444 //
26445 // fallMBB:
26446 //  eax = # XABORT_DEF
26447 //  s1 = eax
26448 //
26449 // sinkMBB:
26450 //  v = phi(s0/mainBB, s1/fallBB)
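// This matches the _xbegin() convention: the intrinsic returns -1
// (_XBEGIN_STARTED) when the transaction starts, while on an abort the
// hardware transfers control to the fallback path with the abort status
// in EAX.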
26452 MachineBasicBlock *thisMBB = MBB;
26453 MachineFunction *MF = MBB->getParent();
26454 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26455 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
26456 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26457 MF->insert(I, mainMBB);
26458 MF->insert(I, fallMBB);
26459 MF->insert(I, sinkMBB);
26461 // Transfer the remainder of BB and its successor edges to sinkMBB.
26462 sinkMBB->splice(sinkMBB->begin(), MBB,
26463 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26464 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26466 MachineRegisterInfo &MRI = MF->getRegInfo();
26467 unsigned DstReg = MI.getOperand(0).getReg();
26468 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26469 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26470 unsigned fallDstReg = MRI.createVirtualRegister(RC);
26472 // thisMBB:
26473 //  xbegin fallMBB
26474 //  # fallthrough to mainMBB
26475 //  # abort jumps to fallMBB
26476 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
26477 thisMBB->addSuccessor(mainMBB);
26478 thisMBB->addSuccessor(fallMBB);
26480 // mainMBB:
26481 //  mainDstReg := -1
26482 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
26483 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26484 mainMBB->addSuccessor(sinkMBB);
26486 // fallMBB:
26487 //  ; pseudo instruction to model hardware's definition from XABORT
26488 // EAX := XABORT_DEF
26489 // fallDstReg := EAX
26490 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
26491 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
26492 .addReg(X86::EAX);
26493 fallMBB->addSuccessor(sinkMBB);
26495 // sinkMBB:
26496 //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
26497 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
26498 .addReg(mainDstReg).addMBB(mainMBB)
26499 .addReg(fallDstReg).addMBB(fallMBB);
26501 MI.eraseFromParent();
26503 return sinkMBB;
26504 }
26505 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26506 const X86Subtarget &Subtarget) {
26507 DebugLoc dl = MI.getDebugLoc();
26508 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26510 // insert input VAL into EAX
26511 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
26512 .addReg(MI.getOperand(0).getReg());
26513 // insert zero to ECX
26514 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26516 // insert zero to EDX
26517 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
26519 // insert WRPKRU instruction
26520 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
26522 MI.eraseFromParent(); // The pseudo is gone now.
26523 return BB;
26524 }
26526 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26527 const X86Subtarget &Subtarget) {
26528 DebugLoc dl = MI.getDebugLoc();
26529 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26531 // insert zero to ECX
26532 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26534 // insert RDPKRU instruction
26535 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
26536 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
26537 .addReg(X86::EAX);
26539 MI.eraseFromParent(); // The pseudo is gone now.
26540 return BB;
26541 }
26543 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
26544 const X86Subtarget &Subtarget,
26545 unsigned Opc) {
26546 DebugLoc dl = MI.getDebugLoc();
26547 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26548 // Address into RAX/EAX, other two args into ECX, EDX.
26549 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26550 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26551 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26552 for (int i = 0; i < X86::AddrNumOperands; ++i)
26553 MIB.add(MI.getOperand(i));
26555 unsigned ValOps = X86::AddrNumOperands;
26556 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
26557 .addReg(MI.getOperand(ValOps).getReg());
26558 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
26559 .addReg(MI.getOperand(ValOps + 1).getReg());
26561 // The instruction itself takes no explicit operands; it implicitly reads EAX/ECX/EDX, which were set up above.
26562 BuildMI(*BB, MI, dl, TII->get(Opc));
26564 MI.eraseFromParent(); // The pseudo is gone now.
26565 return BB;
26566 }
26568 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
26569 const X86Subtarget &Subtarget) {
26570 DebugLoc dl = MI->getDebugLoc();
26571 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26572 // Address into RAX/EAX
26573 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26574 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26575 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26576 for (int i = 0; i < X86::AddrNumOperands; ++i)
26577 MIB.add(MI->getOperand(i));
26579 // The instruction itself takes no explicit operands; it implicitly reads the address placed in EAX/RAX above.
26580 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
26582 MI->eraseFromParent(); // The pseudo is gone now.
26583 return BB;
26584 }
26588 MachineBasicBlock *
26589 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
26590 MachineBasicBlock *MBB) const {
26591 // Emit va_arg instruction on X86-64.
26593 // Operands to this pseudo-instruction:
26594 // 0 ) Output : destination address (reg)
26595 // 1-5) Input : va_list address (addr, i64mem)
26596 // 6 ) ArgSize : Size (in bytes) of vararg type
26597 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
26598 // 8 ) Align : Alignment of type
26599 // 9 ) EFLAGS (implicit-def)
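// For example, 'va_arg(ap, int)' yields ArgSize = 4, ArgMode = 1 (an
// integer-class argument, so gp_offset is used) and Align = 4.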
26601 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
26602 static_assert(X86::AddrNumOperands == 5,
26603 "VAARG_64 assumes 5 address operands");
26605 unsigned DestReg = MI.getOperand(0).getReg();
26606 MachineOperand &Base = MI.getOperand(1);
26607 MachineOperand &Scale = MI.getOperand(2);
26608 MachineOperand &Index = MI.getOperand(3);
26609 MachineOperand &Disp = MI.getOperand(4);
26610 MachineOperand &Segment = MI.getOperand(5);
26611 unsigned ArgSize = MI.getOperand(6).getImm();
26612 unsigned ArgMode = MI.getOperand(7).getImm();
26613 unsigned Align = MI.getOperand(8).getImm();
26615 // Memory Reference
26616 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
26617 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26618 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26620 // Machine Information
26621 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26622 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
26623 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
26624 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
26625 DebugLoc DL = MI.getDebugLoc();
26627 // struct va_list {
26628 //   i32   gp_offset
26629 //   i32   fp_offset
26630 //   i64   overflow_area (address)
26631 //   i64   reg_save_area (address)
26632 // }
26633 // sizeof(va_list) = 24
26634 // alignment(va_list) = 8
26636 unsigned TotalNumIntRegs = 6;
26637 unsigned TotalNumXMMRegs = 8;
26638 bool UseGPOffset = (ArgMode == 1);
26639 bool UseFPOffset = (ArgMode == 2);
26640 unsigned MaxOffset = TotalNumIntRegs * 8 +
26641 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
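// That is, MaxOffset is 48 (6 GP regs * 8 bytes) when pulling from
// gp_offset, or 176 (48 + 8 XMM regs * 16 bytes) when pulling from
// fp_offset, matching the SysV reg_save_area layout.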
26643 // Align ArgSize to a multiple of 8.
26644 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
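// e.g. ArgSize 1..8 -> 8, 9..16 -> 16, and so on.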
26645 bool NeedsAlign = (Align > 8);
26647 MachineBasicBlock *thisMBB = MBB;
26648 MachineBasicBlock *overflowMBB;
26649 MachineBasicBlock *offsetMBB;
26650 MachineBasicBlock *endMBB;
26652 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
26653 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
26654 unsigned OffsetReg = 0;
26656 if (!UseGPOffset && !UseFPOffset) {
26657 // If we only pull from the overflow region, we don't create a branch.
26658 // We don't need to alter control flow.
26659 OffsetDestReg = 0; // unused
26660 OverflowDestReg = DestReg;
26662 offsetMBB = nullptr;
26663 overflowMBB = thisMBB;
26664 endMBB = thisMBB;
26665 } else {
26666 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26667 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26668 // If not, pull from overflow_area. (branch to overflowMBB)
26669 //
26670 //        thisMBB
26671 //       /       \
26672 //      /         \
26673 //  offsetMBB   overflowMBB
26674 //      \         /
26675 //       \       /
26676 //        endMBB
26677 //
26678 // Registers for the PHI in endMBB
26679 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26680 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26682 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26683 MachineFunction *MF = MBB->getParent();
26684 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26685 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26686 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26688 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26690 // Insert the new basic blocks
26691 MF->insert(MBBIter, offsetMBB);
26692 MF->insert(MBBIter, overflowMBB);
26693 MF->insert(MBBIter, endMBB);
26695 // Transfer the remainder of MBB and its successor edges to endMBB.
26696 endMBB->splice(endMBB->begin(), thisMBB,
26697 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26698 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26700 // Make offsetMBB and overflowMBB successors of thisMBB
26701 thisMBB->addSuccessor(offsetMBB);
26702 thisMBB->addSuccessor(overflowMBB);
26704 // endMBB is a successor of both offsetMBB and overflowMBB
26705 offsetMBB->addSuccessor(endMBB);
26706 overflowMBB->addSuccessor(endMBB);
26708 // Load the offset value into a register
26709 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26710 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
26711 .add(Base)
26712 .add(Scale)
26713 .add(Index)
26714 .addDisp(Disp, UseFPOffset ? 4 : 0)
26715 .add(Segment)
26716 .setMemRefs(MMOBegin, MMOEnd);
26718 // Check if there is enough room left to pull this argument.
26719 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
26720 .addReg(OffsetReg)
26721 .addImm(MaxOffset + 8 - ArgSizeA8);
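// That is, the conditional branch below takes the overflow path exactly
// when OffsetReg + ArgSizeA8 would exceed MaxOffset, i.e. when no
// register slot is left for this argument.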
26723 // Branch to "overflowMBB" if offset >= max
26724 // Fall through to "offsetMBB" otherwise
26725 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
26726 .addMBB(overflowMBB);
26727 }
26729 // In offsetMBB, emit code to use the reg_save_area.
26730 if (offsetMBB) {
26731 assert(OffsetReg != 0);
26733 // Read the reg_save_area address.
26734 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
26735 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
26736 .add(Base)
26737 .add(Scale)
26738 .add(Index)
26739 .addDisp(Disp, 16)
26740 .add(Segment)
26741 .setMemRefs(MMOBegin, MMOEnd);
26743 // Zero-extend the offset
26744 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
26745 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
26746 .addImm(0)
26747 .addReg(OffsetReg)
26748 .addImm(X86::sub_32bit);
26750 // Add the offset to the reg_save_area to get the final address.
26751 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26752 .addReg(OffsetReg64)
26753 .addReg(RegSaveReg);
26755 // Compute the offset for the next argument
26756 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26757 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
26758 .addReg(OffsetReg)
26759 .addImm(UseFPOffset ? 16 : 8);
26761 // Store it back into the va_list.
26762 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
26763 .add(Base)
26764 .add(Scale)
26765 .add(Index)
26766 .addDisp(Disp, UseFPOffset ? 4 : 0)
26767 .add(Segment)
26768 .addReg(NextOffsetReg)
26769 .setMemRefs(MMOBegin, MMOEnd);
26772 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
26773 .addMBB(endMBB);
26774 }
26777 // Emit code to use overflow area
26780 // Load the overflow_area address into a register.
26781 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
26782 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
26783 .add(Base)
26784 .add(Scale)
26785 .add(Index)
26786 .addDisp(Disp, 8)
26787 .add(Segment)
26788 .setMemRefs(MMOBegin, MMOEnd);
26790 // If we need to align it, do so. Otherwise, just copy the address
26791 // to OverflowDestReg.
26792 if (NeedsAlign) {
26793 // Align the overflow address
26794 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26795 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26797 // aligned_addr = (addr + (align-1)) & ~(align-1)
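// e.g. with Align = 16: (0x1008 + 15) & ~15 = 0x1010.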
26798 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
26799 .addReg(OverflowAddrReg)
26800 .addImm(Align-1);
26802 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
26803 .addReg(TmpReg)
26804 .addImm(~(uint64_t)(Align-1));
26805 } else {
26806 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
26807 .addReg(OverflowAddrReg);
26808 }
26810 // Compute the next overflow address after this argument.
26811 // (the overflow address should be kept 8-byte aligned)
26812 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26813 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26814 .addReg(OverflowDestReg)
26815 .addImm(ArgSizeA8);
26817 // Store the new overflow address.
26818 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
26819 .add(Base)
26820 .add(Scale)
26821 .add(Index)
26822 .addDisp(Disp, 8)
26823 .add(Segment)
26824 .addReg(NextAddrReg)
26825 .setMemRefs(MMOBegin, MMOEnd);
26827 // If we branched, emit the PHI to the front of endMBB.
26828 if (offsetMBB) {
26829 BuildMI(*endMBB, endMBB->begin(), DL,
26830 TII->get(X86::PHI), DestReg)
26831 .addReg(OffsetDestReg).addMBB(offsetMBB)
26832 .addReg(OverflowDestReg).addMBB(overflowMBB);
26833 }
26835 // Erase the pseudo instruction.
26836 MI.eraseFromParent();
26838 return endMBB;
26839 }
26841 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26842 MachineInstr &MI, MachineBasicBlock *MBB) const {
26843 // Emit code to save XMM registers to the stack. The ABI says that the
26844 // number of registers to save is given in %al, so it's theoretically
26845 // possible to do an indirect jump trick to avoid saving all of them,
26846 // however this code takes a simpler approach and just executes all
26847 // of the stores if %al is non-zero. It's less code, it's probably easier
26848 // on the hardware branch predictor, and stores aren't all that expensive
26849 // anyway.
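// (In the SysV x86-64 ABI, %al carries an upper bound on the number of
// vector registers used by a varargs call; e.g. printf("%f", x) is
// called with %al = 1.)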
26851 // Create the new basic blocks. One block contains all the XMM stores,
26852 // and one block is the final destination regardless of whether any
26853 // stores were performed.
26854 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26855 MachineFunction *F = MBB->getParent();
26856 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26857 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26858 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26859 F->insert(MBBIter, XMMSaveMBB);
26860 F->insert(MBBIter, EndMBB);
26862 // Transfer the remainder of MBB and its successor edges to EndMBB.
26863 EndMBB->splice(EndMBB->begin(), MBB,
26864 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26865 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26867 // The original block will now fall through to the XMM save block.
26868 MBB->addSuccessor(XMMSaveMBB);
26869 // The XMMSaveMBB will fall through to the end block.
26870 XMMSaveMBB->addSuccessor(EndMBB);
26872 // Now add the instructions.
26873 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26874 DebugLoc DL = MI.getDebugLoc();
26876 unsigned CountReg = MI.getOperand(0).getReg();
26877 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26878 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26880 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26881 // If %al is 0, branch around the XMM save block.
26882 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26883 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26884 MBB->addSuccessor(EndMBB);
26885 }
26887 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26888 // that was just emitted, but clearly shouldn't be "saved".
26889 assert((MI.getNumOperands() <= 3 ||
26890 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26891 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26892 "Expected last argument to be EFLAGS");
26893 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26894 // In the XMM save block, save all the XMM argument registers.
26895 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26896 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26897 MachineMemOperand *MMO = F->getMachineMemOperand(
26898 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26899 MachineMemOperand::MOStore,
26900 /*Size=*/16, /*Align=*/16);
26901 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26902 .addFrameIndex(RegSaveFrameIndex)
26903 .addImm(/*Scale=*/1)
26904 .addReg(/*IndexReg=*/0)
26905 .addImm(/*Disp=*/Offset)
26906 .addReg(/*Segment=*/0)
26907 .addReg(MI.getOperand(i).getReg())
26908 .addMemOperand(MMO);
26911 MI.eraseFromParent(); // The pseudo instruction is gone now.
26912 return EndMBB;
26913 }
26916 // The EFLAGS operand of SelectItr might be missing a kill marker
26917 // because there were multiple uses of EFLAGS, and ISel didn't know
26918 // which to mark. Figure out whether SelectItr should have had a
26919 // kill marker, and set it if it should. Returns the correct kill
26920 // marker value.
26921 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26922 MachineBasicBlock* BB,
26923 const TargetRegisterInfo* TRI) {
26924 // Scan forward through BB for a use/def of EFLAGS.
26925 MachineBasicBlock::iterator miI(std::next(SelectItr));
26926 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26927 const MachineInstr& mi = *miI;
26928 if (mi.readsRegister(X86::EFLAGS))
26929 return false;
26930 if (mi.definesRegister(X86::EFLAGS))
26931 break; // Should have kill-flag - update below.
26934 // If we hit the end of the block, check whether EFLAGS is live into a
26935 // successor.
26936 if (miI == BB->end()) {
26937 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26938 sEnd = BB->succ_end();
26939 sItr != sEnd; ++sItr) {
26940 MachineBasicBlock* succ = *sItr;
26941 if (succ->isLiveIn(X86::EFLAGS))
26942 return false;
26943 }
26944 }
26946 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26947 // out. SelectMI should have a kill flag on EFLAGS.
26948 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26949 return true;
26950 }
26952 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26953 // together with other CMOV pseudo-opcodes into a single basic block with a
26954 // conditional jump around it.
26955 static bool isCMOVPseudo(MachineInstr &MI) {
26956 switch (MI.getOpcode()) {
26957 case X86::CMOV_FR32:
26958 case X86::CMOV_FR64:
26959 case X86::CMOV_GR8:
26960 case X86::CMOV_GR16:
26961 case X86::CMOV_GR32:
26962 case X86::CMOV_RFP32:
26963 case X86::CMOV_RFP64:
26964 case X86::CMOV_RFP80:
26965 case X86::CMOV_V2F64:
26966 case X86::CMOV_V2I64:
26967 case X86::CMOV_V4F32:
26968 case X86::CMOV_V4F64:
26969 case X86::CMOV_V4I64:
26970 case X86::CMOV_V16F32:
26971 case X86::CMOV_V8F32:
26972 case X86::CMOV_V8F64:
26973 case X86::CMOV_V8I64:
26974 case X86::CMOV_V8I1:
26975 case X86::CMOV_V16I1:
26976 case X86::CMOV_V32I1:
26977 case X86::CMOV_V64I1:
26978 return true;
26980 default:
26981 return false;
26982 }
26983 }
26985 // Helper function, which inserts PHI functions into SinkMBB:
26986 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
26987 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
26988 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
26989 // the last PHI function inserted.
26990 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26991 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26992 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26993 MachineBasicBlock *SinkMBB) {
26994 MachineFunction *MF = TrueMBB->getParent();
26995 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
26996 DebugLoc DL = MIItBegin->getDebugLoc();
26998 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
26999 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27001 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
27003 // As we are creating the PHIs, we have to be careful if there is more than
27004 // one. Later CMOVs may reference the results of earlier CMOVs, but later
27005 // PHIs have to reference the individual true/false inputs from earlier PHIs.
27006 // That also means that PHI construction must work forward from earlier to
27007 // later, and that the code must maintain a mapping from each earlier PHI's
27008 // destination register to the registers that went into that PHI.
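// For example, given
//   %t2 = CMOV %t1, %f1, cc
//   %t3 = CMOV %t2, %f2, cc
// the PHI for %t3 must use %t1 (not %t2) as its incoming value on the
// true edge; RegRewriteTable records exactly that substitution.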
27009 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
27010 MachineInstrBuilder MIB;
27012 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
27013 unsigned DestReg = MIIt->getOperand(0).getReg();
27014 unsigned Op1Reg = MIIt->getOperand(1).getReg();
27015 unsigned Op2Reg = MIIt->getOperand(2).getReg();
27017 // If this CMOV we are generating is the opposite condition from
27018 // the jump we generated, then we have to swap the operands for the
27019 // PHI that is going to be generated.
27020 if (MIIt->getOperand(3).getImm() == OppCC)
27021 std::swap(Op1Reg, Op2Reg);
27023 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
27024 Op1Reg = RegRewriteTable[Op1Reg].first;
27026 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
27027 Op2Reg = RegRewriteTable[Op2Reg].second;
27029 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
27030 .addReg(Op1Reg)
27031 .addMBB(FalseMBB)
27032 .addReg(Op2Reg)
27033 .addMBB(TrueMBB);
27035 // Add this PHI to the rewrite table.
27036 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
27037 }
27039 return MIB;
27040 }
27042 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
27043 MachineBasicBlock *
27044 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
27045 MachineInstr &SecondCascadedCMOV,
27046 MachineBasicBlock *ThisMBB) const {
27047 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27048 DebugLoc DL = FirstCMOV.getDebugLoc();
27050 // We lower cascaded CMOVs such as
27052 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
27054 // to two successive branches.
27056 // Without this, we would add a PHI between the two jumps, which ends up
27057 // creating a few copies all around. For instance, for
27059 // (sitofp (zext (fcmp une)))
27061 // we would generate:
27063 // ucomiss %xmm1, %xmm0
27064 // movss <1.0f>, %xmm0
27065 // movaps %xmm0, %xmm1
27066 // jne .LBB5_2
27067 // xorps %xmm1, %xmm1
27068 // .LBB5_2:
27069 // jp .LBB5_4
27070 // movaps %xmm1, %xmm0
27071 // .LBB5_4:
27072 // retq
27074 // because this custom-inserter would have generated:
27076 //   A
27077 //   | \
27078 //   |  B
27079 //   | /
27080 //   C
27081 //   | \
27082 //   |  D
27083 //   | /
27084 //   E
27086 //   A: X = ...; Y = ...
27087 //   B: empty
27088 //   C: Z = PHI [X, A], [Y, B]
27089 //   D: empty
27090 //   E: PHI [X, C], [Z, D]
27092 // If we lower both CMOVs in a single step, we can instead generate:
27094 //   A
27095 //   | \
27096 //   |  C
27097 //   | /|
27098 //   |/ |
27099 //   |  |
27100 //   |  D
27101 //   | /
27102 //   E
27104 //   A: X = ...; Y = ...
27105 //   D: empty
27106 //   E: PHI [X, A], [X, C], [Y, D]
27108 // Which, in our sitofp/fcmp example, gives us something like:
27110 // ucomiss %xmm1, %xmm0
27111 // movss <1.0f>, %xmm0
27112 // jne .LBB5_4
27113 // jp .LBB5_4
27114 // xorps %xmm0, %xmm0
27115 // .LBB5_4:
27116 // retq
27119 // We lower cascaded CMOV into two successive branches to the same block.
27120 // EFLAGS is used by both, so mark it as live in the second.
27121 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27122 MachineFunction *F = ThisMBB->getParent();
27123 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27124 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27125 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27127 MachineFunction::iterator It = ++ThisMBB->getIterator();
27128 F->insert(It, FirstInsertedMBB);
27129 F->insert(It, SecondInsertedMBB);
27130 F->insert(It, SinkMBB);
27132 // For a cascaded CMOV, we lower it to two successive branches to
27133 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
27134 // the FirstInsertedMBB.
27135 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
27137 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27138 // live into the sink and copy blocks.
27139 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27140 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
27141 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
27142 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
27143 SinkMBB->addLiveIn(X86::EFLAGS);
27146 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27147 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27148 std::next(MachineBasicBlock::iterator(FirstCMOV)),
27149 ThisMBB->end());
27150 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27152 // Fallthrough block for ThisMBB.
27153 ThisMBB->addSuccessor(FirstInsertedMBB);
27154 // The true block target of the first branch is always SinkMBB.
27155 ThisMBB->addSuccessor(SinkMBB);
27156 // Fallthrough block for FirstInsertedMBB.
27157 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
27158 // The true block for the branch of FirstInsertedMBB.
27159 FirstInsertedMBB->addSuccessor(SinkMBB);
27160 // This is fallthrough.
27161 SecondInsertedMBB->addSuccessor(SinkMBB);
27163 // Create the conditional branch instructions.
27164 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
27165 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
27166 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27168 X86::CondCode SecondCC =
27169 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
27170 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
27171 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
27173 // SinkMBB:
27174 //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
27175 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
27176 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
27177 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
27178 MachineInstrBuilder MIB =
27179 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
27180 .addReg(Op1Reg)
27181 .addMBB(SecondInsertedMBB)
27182 .addReg(Op2Reg)
27183 .addMBB(ThisMBB);
27185 // The edge from FirstInsertedMBB carries the same incoming value as the
27186 // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
27187 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
27188 // Copy the PHI result to the register defined by the second CMOV.
27189 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
27190 TII->get(TargetOpcode::COPY),
27191 SecondCascadedCMOV.getOperand(0).getReg())
27192 .addReg(FirstCMOV.getOperand(0).getReg());
27194 // Now remove the CMOVs.
27195 FirstCMOV.eraseFromParent();
27196 SecondCascadedCMOV.eraseFromParent();
27198 return SinkMBB;
27199 }
27201 MachineBasicBlock *
27202 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
27203 MachineBasicBlock *ThisMBB) const {
27204 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27205 DebugLoc DL = MI.getDebugLoc();
27207 // To "insert" a SELECT_CC instruction, we actually have to insert the
27208 // diamond control-flow pattern. The incoming instruction knows the
27209 // destination vreg to set, the condition code register to branch on, the
27210 // true/false values to select between and a branch opcode to use.
27215 // cmpTY ccX, r1, r2
27217 // fallthrough --> FalseMBB
27219 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
27220 // as described above, by inserting a BB, and then making a PHI at the join
27221 // point to select the true and false operands of the CMOV in the PHI.
27223 // The code also handles two different cases of multiple CMOV opcodes
27224 // in a row.
27226 // Case 1:
27227 // In this case, there are multiple CMOVs in a row, all which are based on
27228 // the same condition setting (or the exact opposite condition setting).
27229 // In this case we can lower all the CMOVs using a single inserted BB, and
27230 // then make a number of PHIs at the join point to model the CMOVs. The only
27231 // trickiness here, is that in a case like:
27233 // t2 = CMOV cond1 t1, f1
27234 // t3 = CMOV cond1 t2, f2
27236 // when rewriting this into PHIs, we have to perform some renaming on the
27237 // temps since you cannot have a PHI operand refer to a PHI result earlier
27238 // in the same block. The "simple" but wrong lowering would be:
27240 // t2 = PHI t1(BB1), f1(BB2)
27241 // t3 = PHI t2(BB1), f2(BB2)
27243 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
27244 // renaming is to note that on the path through BB1, t2 is really just a
27245 // copy of t1, and do that renaming, properly generating:
27247 // t2 = PHI t1(BB1), f1(BB2)
27248 // t3 = PHI t1(BB1), f2(BB2)
27250 // Case 2:
27251 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
27252 // function - EmitLoweredCascadedSelect.
27254 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
27255 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27256 MachineInstr *LastCMOV = &MI;
27257 MachineBasicBlock::iterator NextMIIt =
27258 std::next(MachineBasicBlock::iterator(MI));
27260 // Check for case 1, where there are multiple CMOVs with the same condition
27261 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
27262 // number of jumps the most.
27264 if (isCMOVPseudo(MI)) {
27265 // See if we have a string of CMOVS with the same condition.
27266 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
27267 (NextMIIt->getOperand(3).getImm() == CC ||
27268 NextMIIt->getOperand(3).getImm() == OppCC)) {
27269 LastCMOV = &*NextMIIt;
27270 ++NextMIIt;
27271 }
27272 }
27274 // This checks for case 2, but only if we didn't already find case 1,
27275 // as indicated by LastCMOV == &MI.
27276 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
27277 NextMIIt->getOpcode() == MI.getOpcode() &&
27278 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
27279 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
27280 NextMIIt->getOperand(1).isKill()) {
27281 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
27282 }
27284 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27285 MachineFunction *F = ThisMBB->getParent();
27286 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
27287 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27289 MachineFunction::iterator It = ++ThisMBB->getIterator();
27290 F->insert(It, FalseMBB);
27291 F->insert(It, SinkMBB);
27293 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27294 // live into the sink and copy blocks.
27295 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27296 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
27297 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
27298 FalseMBB->addLiveIn(X86::EFLAGS);
27299 SinkMBB->addLiveIn(X86::EFLAGS);
27302 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27303 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27304 std::next(MachineBasicBlock::iterator(LastCMOV)),
27305 ThisMBB->end());
27306 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27308 // Fallthrough block for ThisMBB.
27309 ThisMBB->addSuccessor(FalseMBB);
27310 // The true block target of the first (or only) branch is always a SinkMBB.
27311 ThisMBB->addSuccessor(SinkMBB);
27312 // Fallthrough block for FalseMBB.
27313 FalseMBB->addSuccessor(SinkMBB);
27315 // Create the conditional branch instruction.
27316 unsigned Opc = X86::GetCondBranchFromCond(CC);
27317 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27319 // SinkMBB:
27320 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
27321 // ...
27322 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
27323 MachineBasicBlock::iterator MIItEnd =
27324 std::next(MachineBasicBlock::iterator(LastCMOV));
27325 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
27327 // Now remove the CMOV(s).
27328 ThisMBB->erase(MIItBegin, MIItEnd);
27330 return SinkMBB;
27331 }
27333 MachineBasicBlock *
27334 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
27335 MachineBasicBlock *BB) const {
27336 // Combine the following atomic floating-point modification pattern:
27337 // a.store(reg OP a.load(acquire), release)
27338 // Transform them into:
27339 // OPss (%gpr), %xmm
27340 // movss %xmm, (%gpr)
27341 // Or sd equivalent for 64-bit operations.
27342 unsigned MOp, FOp;
27343 switch (MI.getOpcode()) {
27344 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
27345 case X86::RELEASE_FADD32mr:
27346 FOp = X86::ADDSSrm;
27347 MOp = X86::MOVSSmr;
27348 break;
27349 case X86::RELEASE_FADD64mr:
27350 FOp = X86::ADDSDrm;
27351 MOp = X86::MOVSDmr;
27352 break;
27353 }
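// A sketch of the matched pattern, in IR-like pseudocode (names illustrative):
//   %old = load atomic float, float* %p acquire
//   %new = fadd float %old, %x
//   store atomic float %new, float* %p release
// which then becomes roughly:  addss (%gpr), %xmm ; movss %xmm, (%gpr).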
27354 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27355 DebugLoc DL = MI.getDebugLoc();
27356 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
27357 unsigned ValOpIdx = X86::AddrNumOperands;
27358 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
27359 MachineInstrBuilder MIB =
27360 BuildMI(*BB, MI, DL, TII->get(FOp),
27361 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
27362 .addReg(VSrc);
27363 for (int i = 0; i < X86::AddrNumOperands; ++i) {
27364 MachineOperand &Operand = MI.getOperand(i);
27365 // Clear any kill flags on register operands as we'll create a second
27366 // instruction using the same address operands.
27367 if (Operand.isReg())
27368 Operand.setIsKill(false);
27369 MIB.add(Operand);
27370 }
27371 MachineInstr *FOpMI = MIB;
27372 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
27373 for (int i = 0; i < X86::AddrNumOperands; ++i)
27374 MIB.add(MI.getOperand(i));
27375 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
27376 MI.eraseFromParent(); // The pseudo instruction is gone now.
27377 return BB;
27378 }
27380 MachineBasicBlock *
27381 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
27382 MachineBasicBlock *BB) const {
27383 MachineFunction *MF = BB->getParent();
27384 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27385 DebugLoc DL = MI.getDebugLoc();
27386 const BasicBlock *LLVM_BB = BB->getBasicBlock();
27388 assert(MF->shouldSplitStack());
27390 const bool Is64Bit = Subtarget.is64Bit();
27391 const bool IsLP64 = Subtarget.isTarget64BitLP64();
27393 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
27394 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
27396 // BB:
27397 // ... [Till the alloca]
27398 // If stacklet is not large enough, jump to mallocMBB
27399 //
27400 // bumpMBB:
27401 // Allocate by subtracting from RSP
27402 // Jump to continueMBB
27403 //
27404 // mallocMBB:
27405 // Allocate by call to runtime
27406 //
27407 // continueMBB:
27408 // ...
27409 // [rest of original BB]
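// e.g. on LP64 the emitted check is, roughly (virtual register names
// illustrative):
//   mov %rsp, %t ; sub %size, %t ; cmp %t, %fs:0x70 ; jg mallocMBB
// with the stacklet limit read from the TLS slot selected via TlsReg and
// TlsOffset above.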
27412 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27413 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27414 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27416 MachineRegisterInfo &MRI = MF->getRegInfo();
27417 const TargetRegisterClass *AddrRegClass =
27418 getRegClassFor(getPointerTy(MF->getDataLayout()));
27420 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27421 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27422 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
27423 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
27424 sizeVReg = MI.getOperand(1).getReg(),
27425 physSPReg =
27426 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
27428 MachineFunction::iterator MBBIter = ++BB->getIterator();
27430 MF->insert(MBBIter, bumpMBB);
27431 MF->insert(MBBIter, mallocMBB);
27432 MF->insert(MBBIter, continueMBB);
27434 continueMBB->splice(continueMBB->begin(), BB,
27435 std::next(MachineBasicBlock::iterator(MI)), BB->end());
27436 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
27438 // Add code to the main basic block to check if the stack limit has been hit,
27439 // and if so, jump to mallocMBB otherwise to bumpMBB.
27440 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
27441 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
27442 .addReg(tmpSPVReg).addReg(sizeVReg);
27443 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
27444 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
27445 .addReg(SPLimitVReg);
27446 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
27448 // bumpMBB simply decreases the stack pointer, since we know the current
27449 // stacklet has enough space.
27450 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
27451 .addReg(SPLimitVReg);
27452 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
27453 .addReg(SPLimitVReg);
27454 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27456 // Calls into a routine in libgcc to allocate more space from the heap.
27457 const uint32_t *RegMask =
27458 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
27459 if (IsLP64) {
27460 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
27461 .addReg(sizeVReg);
27462 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27463 .addExternalSymbol("__morestack_allocate_stack_space")
27464 .addRegMask(RegMask)
27465 .addReg(X86::RDI, RegState::Implicit)
27466 .addReg(X86::RAX, RegState::ImplicitDefine);
27467 } else if (Is64Bit) {
27468 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
27469 .addReg(sizeVReg);
27470 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27471 .addExternalSymbol("__morestack_allocate_stack_space")
27472 .addRegMask(RegMask)
27473 .addReg(X86::EDI, RegState::Implicit)
27474 .addReg(X86::EAX, RegState::ImplicitDefine);
27475 } else {
27476 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
27477 .addImm(16);
27478 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
27479 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
27480 .addExternalSymbol("__morestack_allocate_stack_space")
27481 .addRegMask(RegMask)
27482 .addReg(X86::EAX, RegState::ImplicitDefine);
27483 }
27485 if (!Is64Bit)
27486 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
27487 .addImm(16);
27489 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
27490 .addReg(IsLP64 ? X86::RAX : X86::EAX);
27491 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27493 // Set up the CFG correctly.
27494 BB->addSuccessor(bumpMBB);
27495 BB->addSuccessor(mallocMBB);
27496 mallocMBB->addSuccessor(continueMBB);
27497 bumpMBB->addSuccessor(continueMBB);
27499 // Take care of the PHI nodes.
27500 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
27501 MI.getOperand(0).getReg())
27502 .addReg(mallocPtrVReg)
27503 .addMBB(mallocMBB)
27504 .addReg(bumpSPPtrVReg)
27505 .addMBB(bumpMBB);
27507 // Delete the original pseudo instruction.
27508 MI.eraseFromParent();
27511 return continueMBB;
27512 }
27514 MachineBasicBlock *
27515 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
27516 MachineBasicBlock *BB) const {
27517 MachineFunction *MF = BB->getParent();
27518 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27519 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
27520 DebugLoc DL = MI.getDebugLoc();
27522 assert(!isAsynchronousEHPersonality(
27523 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
27524 "SEH does not use catchret!");
27526 // Only 32-bit EH needs to worry about manually restoring stack pointers.
27527 if (!Subtarget.is32Bit())
27528 return BB;
27530 // C++ EH creates a new target block to hold the restore code, and wires up
27531 // the new block to the return destination with a normal JMP_4.
27532 MachineBasicBlock *RestoreMBB =
27533 MF->CreateMachineBasicBlock(BB->getBasicBlock());
27534 assert(BB->succ_size() == 1);
27535 MF->insert(std::next(BB->getIterator()), RestoreMBB);
27536 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
27537 BB->addSuccessor(RestoreMBB);
27538 MI.getOperand(0).setMBB(RestoreMBB);
27540 auto RestoreMBBI = RestoreMBB->begin();
27541 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
27542 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
27544 return BB;
27545 }
27546 MachineBasicBlock *
27547 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
27548 MachineBasicBlock *BB) const {
27549 MachineFunction *MF = BB->getParent();
27550 const Constant *PerFn = MF->getFunction().getPersonalityFn();
27551 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
27552 // Only 32-bit SEH requires special handling for catchpad.
27553 if (IsSEH && Subtarget.is32Bit()) {
27554 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27555 DebugLoc DL = MI.getDebugLoc();
27556 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
27557 }
27558 MI.eraseFromParent();
27559 return BB;
27560 }
27562 MachineBasicBlock *
27563 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
27564 MachineBasicBlock *BB) const {
27565 // So, here we replace TLSADDR with the sequence:
27566 // adjust_stackdown -> TLSADDR -> adjust_stackup.
27567 // We need this because TLSADDR is lowered into a call
27568 // inside MC, so without the two markers shrink-wrapping
27569 // may push the prologue/epilogue past them.
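// Schematically, the block therefore ends up containing:
//   adjust_stackdown ; TLSADDR ; adjust_stackup
// so the frame-setup markers bracket the call MC will later materialize.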
27570 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27571 DebugLoc DL = MI.getDebugLoc();
27572 MachineFunction &MF = *BB->getParent();
27574 // Emit CALLSEQ_START right before the instruction.
27575 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
27576 MachineInstrBuilder CallseqStart =
27577 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
27578 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
27580 // Emit CALLSEQ_END right after the instruction.
27581 // We don't call erase from parent because we want to keep the
27582 // original instruction around.
27583 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
27584 MachineInstrBuilder CallseqEnd =
27585 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
27586 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
27588 return BB;
27589 }
27591 MachineBasicBlock *
27592 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
27593 MachineBasicBlock *BB) const {
27594 // This is pretty easy. We're taking the value that we received from
27595 // our load from the relocation, sticking it in either RDI (x86-64)
27596 // or EAX and doing an indirect call. The return value will then
27597 // be in the normal return register.
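// A sketch of the 64-bit Darwin TLV sequence this produces (symbol name
// illustrative):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)            ## result returned in %rax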
27598 MachineFunction *F = BB->getParent();
27599 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27600 DebugLoc DL = MI.getDebugLoc();
27602 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
27603 assert(MI.getOperand(3).isGlobal() && "This should be a global");
27605 // Get a register mask for the lowered call.
27606 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
27607 // proper register mask.
27608 const uint32_t *RegMask =
27609 Subtarget.is64Bit() ?
27610 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
27611 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
27612 if (Subtarget.is64Bit()) {
27613 MachineInstrBuilder MIB =
27614 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
27615 .addReg(X86::RIP)
27616 .addImm(1)
27617 .addReg(0)
27618 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27619 MI.getOperand(3).getTargetFlags())
27620 .addReg(0);
27621 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
27622 addDirectMem(MIB, X86::RDI);
27623 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
27624 } else if (!isPositionIndependent()) {
27625 MachineInstrBuilder MIB =
27626 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27627 .addReg(0)
27628 .addImm(1)
27629 .addReg(0)
27630 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27631 MI.getOperand(3).getTargetFlags())
27632 .addReg(0);
27633 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27634 addDirectMem(MIB, X86::EAX);
27635 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27636 } else {
27637 MachineInstrBuilder MIB =
27638 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27639 .addReg(TII->getGlobalBaseReg(F))
27640 .addImm(1)
27641 .addReg(0)
27642 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27643 MI.getOperand(3).getTargetFlags())
27644 .addReg(0);
27645 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27646 addDirectMem(MIB, X86::EAX);
27647 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27648 }
27650 MI.eraseFromParent(); // The pseudo instruction is gone now.
27651 return BB;
27652 }
27654 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
27655 switch (RPOpc) {
27656 case X86::RETPOLINE_CALL32:
27657 return X86::CALLpcrel32;
27658 case X86::RETPOLINE_CALL64:
27659 return X86::CALL64pcrel32;
27660 case X86::RETPOLINE_TCRETURN32:
27661 return X86::TCRETURNdi;
27662 case X86::RETPOLINE_TCRETURN64:
27663 return X86::TCRETURNdi64;
27664 }
27665 llvm_unreachable("not retpoline opcode");
27666 }
27668 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
27669 unsigned Reg) {
27670 if (Subtarget.useRetpolineExternalThunk()) {
27671 // When using an external thunk for retpolines, we pick names that match the
27672 // names GCC happens to use as well. This helps simplify the implementation
27673 // of the thunks for kernels where they have no easy ability to create
27674 // aliases and are doing non-trivial configuration of the thunk's body. For
27675 // example, the Linux kernel will do boot-time hot patching of the thunk
27676 // bodies and cannot easily export aliases of these to loaded modules.
27678 // Note that at any point in the future, we may need to change the semantics
27679 // of how we implement retpolines and at that time will likely change the
27680 // name of the called thunk. Essentially, there is no hard guarantee that
27681 // LLVM will generate calls to specific thunks; we merely make a best-effort
27682 // attempt to help out kernels and other systems where duplicating the
27683 // thunks is costly.
27684 switch (Reg) {
27685 case X86::EAX:
27686 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27687 return "__x86_indirect_thunk_eax";
27688 case X86::ECX:
27689 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27690 return "__x86_indirect_thunk_ecx";
27691 case X86::EDX:
27692 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27693 return "__x86_indirect_thunk_edx";
27694 case X86::EDI:
27695 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27696 return "__x86_indirect_thunk_edi";
27697 case X86::R11:
27698 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27699 return "__x86_indirect_thunk_r11";
27700 }
27701 llvm_unreachable("unexpected reg for retpoline");
27702 }
27704 // When targeting an internal COMDAT thunk use an LLVM-specific name.
27705 switch (Reg) {
27706 case X86::EAX:
27707 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27708 return "__llvm_retpoline_eax";
27709 case X86::ECX:
27710 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27711 return "__llvm_retpoline_ecx";
27712 case X86::EDX:
27713 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27714 return "__llvm_retpoline_edx";
27715 case X86::EDI:
27716 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27717 return "__llvm_retpoline_edi";
27718 case X86::R11:
27719 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27720 return "__llvm_retpoline_r11";
27721 }
27722 llvm_unreachable("unexpected reg for retpoline");
27723 }
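// For example, an indirect call through a virtual register is rewritten by
// EmitLoweredRetpoline below so that the callee is pinned in a scratch
// register and the call is routed through the thunk; on 64-bit, a sketch:
//   movq %callee_vreg, %r11 ; callq __llvm_retpoline_r11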
27725 MachineBasicBlock *
27726 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27727 MachineBasicBlock *BB) const {
27728 // Copy the virtual register into the R11 physical register and
27729 // call the retpoline thunk.
27730 DebugLoc DL = MI.getDebugLoc();
27731 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27732 unsigned CalleeVReg = MI.getOperand(0).getReg();
27733 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27735 // Find an available scratch register to hold the callee. On 64-bit, we can
27736 // just use R11, but we scan for uses anyway to ensure we don't generate
27737 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27738 // already a register use operand to the call to hold the callee. If none
27739 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
27740 // register and ESI is the base pointer to realigned stack frames with VLAs.
27741 SmallVector<unsigned, 3> AvailableRegs;
27742 if (Subtarget.is64Bit())
27743 AvailableRegs.push_back(X86::R11);
27744 else
27745 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
27747 // Zero out any registers that are already used.
27748 for (const auto &MO : MI.operands()) {
27749 if (MO.isReg() && MO.isUse())
27750 for (unsigned &Reg : AvailableRegs)
27751 if (Reg == MO.getReg())
27752 Reg = 0;
27753 }
27755 // Choose the first remaining non-zero available register.
27756 unsigned AvailableReg = 0;
27757 for (unsigned MaybeReg : AvailableRegs) {
27758 if (MaybeReg) {
27759 AvailableReg = MaybeReg;
27760 break;
27761 }
27762 }
27763 if (!AvailableReg)
27764 report_fatal_error("calling convention incompatible with retpoline, no "
27765 "available registers");
27767 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
27769 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27770 .addReg(CalleeVReg);
27771 MI.getOperand(0).ChangeToES(Symbol);
27772 MI.setDesc(TII->get(Opc));
27773 MachineInstrBuilder(*BB->getParent(), &MI)
27774 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
27775 return BB;
27776 }
27778 /// SetJmp implies future control flow change upon calling the corresponding
27779 /// function.
27780 /// Instead of using the 'return' instruction, the long jump fixes the stack and
27781 /// performs an indirect branch. To do so it uses the registers that were stored
27782 /// in the jump buffer (when calling SetJmp).
27783 /// In case the shadow stack is enabled we need to fix it as well, because some
27784 /// return addresses will be skipped.
27785 /// The function will save the SSP for future fixing in the function
27786 /// emitLongJmpShadowStackFix.
27787 /// \sa emitLongJmpShadowStackFix
27788 /// \param [in] MI The temporary Machine Instruction for the builtin.
27789 /// \param [in] MBB The Machine Basic Block that will be modified.
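/// Jump-buffer layout assumed by this code, in pointer-sized slots:
/// buf[0] = frame pointer, buf[1] = resume IP, buf[2] = stack pointer,
/// buf[3] = shadow stack pointer (hence the offset of 3 * PVT.getStoreSize()
/// used below).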
27790 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
27791 MachineBasicBlock *MBB) const {
27792 DebugLoc DL = MI.getDebugLoc();
27793 MachineFunction *MF = MBB->getParent();
27794 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27795 MachineRegisterInfo &MRI = MF->getRegInfo();
27796 MachineInstrBuilder MIB;
27798 // Memory Reference.
27799 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27800 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27802 // Initialize a register with zero.
27803 MVT PVT = getPointerTy(MF->getDataLayout());
27804 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27805 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
27806 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
27807 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
27808 .addDef(ZReg)
27809 .addReg(ZReg, RegState::Undef)
27810 .addReg(ZReg, RegState::Undef);
27812 // Read the current SSP Register value to the zeroed register.
27813 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
27814 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
27815 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
27817 // Write the SSP register value to offset 3 in input memory buffer.
27818 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27819 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
27820 const int64_t SSPOffset = 3 * PVT.getStoreSize();
27821 const unsigned MemOpndSlot = 1;
27822 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27823 if (i == X86::AddrDisp)
27824 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
27825 else
27826 MIB.add(MI.getOperand(MemOpndSlot + i));
27827 }
27828 MIB.addReg(SSPCopyReg);
27829 MIB.setMemRefs(MMOBegin, MMOEnd);
27830 }
27832 MachineBasicBlock *
27833 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27834 MachineBasicBlock *MBB) const {
27835 DebugLoc DL = MI.getDebugLoc();
27836 MachineFunction *MF = MBB->getParent();
27837 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27838 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27839 MachineRegisterInfo &MRI = MF->getRegInfo();
27841 const BasicBlock *BB = MBB->getBasicBlock();
27842 MachineFunction::iterator I = ++MBB->getIterator();
27844 // Memory Reference
27845 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27846 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27848 unsigned DstReg;
27849 unsigned MemOpndSlot = 0;
27851 unsigned CurOp = 0;
27853 DstReg = MI.getOperand(CurOp++).getReg();
27854 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27855 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27857 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27858 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
27860 MemOpndSlot = CurOp;
27862 MVT PVT = getPointerTy(MF->getDataLayout());
27863 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27864 "Invalid Pointer Size!");
27866 // For v = setjmp(buf), we generate
27867 //
27868 // thisMBB:
27869 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
27870 // SjLjSetup restoreMBB
27871 //
27872 // mainMBB:
27873 // v_main = 0
27874 //
27875 // sinkMBB:
27876 // v = phi(main, restore)
27877 //
27878 // restoreMBB:
27879 // if base pointer being used, load it from frame
27880 // v_restore = 1
27882 MachineBasicBlock *thisMBB = MBB;
27883 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27884 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27885 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27886 MF->insert(I, mainMBB);
27887 MF->insert(I, sinkMBB);
27888 MF->push_back(restoreMBB);
27889 restoreMBB->setHasAddressTaken();
27891 MachineInstrBuilder MIB;
27893 // Transfer the remainder of BB and its successor edges to sinkMBB.
27894 sinkMBB->splice(sinkMBB->begin(), MBB,
27895 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27896 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27898 // thisMBB:
27899 unsigned PtrStoreOpc = 0;
27900 unsigned LabelReg = 0;
27901 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27902 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27903 !isPositionIndependent();
27905 // Prepare IP either in reg or imm.
27906 if (!UseImmLabel) {
27907 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27908 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27909 LabelReg = MRI.createVirtualRegister(PtrRC);
27910 if (Subtarget.is64Bit()) {
27911 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
27912 .addReg(X86::RIP)
27913 .addImm(0)
27914 .addReg(0)
27915 .addMBB(restoreMBB)
27916 .addReg(0);
27917 } else {
27918 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27919 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
27920 .addReg(XII->getGlobalBaseReg(MF))
27921 .addImm(0)
27922 .addReg(0)
27923 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
27924 .addReg(0);
27925 }
27926 } else
27927 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27929 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27930 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27931 if (i == X86::AddrDisp)
27932 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
27933 else
27934 MIB.add(MI.getOperand(MemOpndSlot + i));
27935 }
27936 if (!UseImmLabel)
27937 MIB.addReg(LabelReg);
27938 else
27939 MIB.addMBB(restoreMBB);
27940 MIB.setMemRefs(MMOBegin, MMOEnd);
27942 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
27943 emitSetJmpShadowStackFix(MI, thisMBB);
27944 }
27946 // Setup
27947 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27948 .addMBB(restoreMBB);
27950 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27951 MIB.addRegMask(RegInfo->getNoPreservedMask());
27952 thisMBB->addSuccessor(mainMBB);
27953 thisMBB->addSuccessor(restoreMBB);
27955 // mainMBB:
27956 // EAX = 0
27957 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27958 mainMBB->addSuccessor(sinkMBB);
27960 // sinkMBB:
27961 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27962 TII->get(X86::PHI), DstReg)
27963 .addReg(mainDstReg).addMBB(mainMBB)
27964 .addReg(restoreDstReg).addMBB(restoreMBB);
27966 // restoreMBB:
27967 if (RegInfo->hasBasePointer(*MF)) {
27968 const bool Uses64BitFramePtr =
27969 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27970 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27971 X86FI->setRestoreBasePointer(MF);
27972 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27973 unsigned BasePtr = RegInfo->getBaseRegister();
27974 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27975 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27976 FramePtr, true, X86FI->getRestoreBasePointerOffset())
27977 .setMIFlag(MachineInstr::FrameSetup);
27978 }
27979 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
27980 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
27981 restoreMBB->addSuccessor(sinkMBB);
27983 MI.eraseFromParent();
27984 return sinkMBB;
27985 }
27987 /// Fix the shadow stack using the previously saved SSP pointer.
27988 /// \sa emitSetJmpShadowStackFix
27989 /// \param [in] MI The temporary Machine Instruction for the builtin.
27990 /// \param [in] MBB The Machine Basic Block that will be modified.
27991 /// \return The sink MBB that will perform the future indirect branch.
27992 MachineBasicBlock *
27993 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
27994 MachineBasicBlock *MBB) const {
27995 DebugLoc DL = MI.getDebugLoc();
27996 MachineFunction *MF = MBB->getParent();
27997 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27998 MachineRegisterInfo &MRI = MF->getRegInfo();
28000 // Memory Reference
28001 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28002 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28004 MVT PVT = getPointerTy(MF->getDataLayout());
28005 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
28007 // checkSspMBB:
28008 // xor vreg1, vreg1
28009 // rdssp vreg1
28010 // test vreg1, vreg1
28011 // je sinkMBB # Jump if Shadow Stack is not supported
28012 // fallMBB:
28013 // mov buf+24/12(%rip), vreg2
28014 // sub vreg1, vreg2
28015 // jbe sinkMBB # No need to fix the Shadow Stack
28016 // fixShadowMBB:
28017 // shr 3/2, vreg2
28018 // incssp vreg2 # fix the SSP according to the lower 8 bits
28019 // shr 8, vreg2
28020 // je sinkMBB
28021 // fixShadowLoopPrepareMBB:
28022 // shl vreg2
28023 // mov 128, vreg3
28024 // fixShadowLoopMBB:
28025 // incssp vreg3
28026 // dec vreg2
28027 // jne fixShadowLoopMBB # Iterate until you finish fixing
28028 // # the Shadow Stack
28029 // sinkMBB:
28031 MachineFunction::iterator I = ++MBB->getIterator();
28032 const BasicBlock *BB = MBB->getBasicBlock();
28034 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
28035 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
28036 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
28037 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
28038 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
28039 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
28040 MF->insert(I, checkSspMBB);
28041 MF->insert(I, fallMBB);
28042 MF->insert(I, fixShadowMBB);
28043 MF->insert(I, fixShadowLoopPrepareMBB);
28044 MF->insert(I, fixShadowLoopMBB);
28045 MF->insert(I, sinkMBB);
28047 // Transfer the remainder of BB and its successor edges to sinkMBB.
28048 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
28049 MBB->end());
28050 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
28052 MBB->addSuccessor(checkSspMBB);
28054 // Initialize a register with zero.
28055 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
28056 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
28057 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
28058 .addDef(ZReg)
28059 .addReg(ZReg, RegState::Undef)
28060 .addReg(ZReg, RegState::Undef);
28062 // Read the current SSP Register value to the zeroed register.
28063 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
28064 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
28065 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
28067 // Check whether the result of the SSP register is zero and jump directly
28068 // to the sink.
28069 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
28070 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
28071 .addReg(SSPCopyReg)
28072 .addReg(SSPCopyReg);
28073 BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28074 checkSspMBB->addSuccessor(sinkMBB);
28075 checkSspMBB->addSuccessor(fallMBB);
28077 // Reload the previously saved SSP register value.
28078 unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
28079 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28080 const int64_t SPPOffset = 3 * PVT.getStoreSize();
28081 MachineInstrBuilder MIB =
28082 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
28083 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28084 if (i == X86::AddrDisp)
28085 MIB.addDisp(MI.getOperand(i), SPPOffset);
28086 else
28087 MIB.add(MI.getOperand(i));
28088 }
28089 MIB.setMemRefs(MMOBegin, MMOEnd);
28091 // Subtract the current SSP from the previous SSP.
28092 unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
28093 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
28094 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
28095 .addReg(PrevSSPReg)
28096 .addReg(SSPCopyReg);
28098 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
28099 BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
28100 fallMBB->addSuccessor(sinkMBB);
28101 fallMBB->addSuccessor(fixShadowMBB);
28103 // Shift right by 2 (32-bit) or 3 (64-bit), because incssp multiplies its argument by 4 or 8.
28104 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
28105 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
28106 unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
28107 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
28108 .addReg(SspSubReg)
28109 .addImm(Offset);
28111 // Advance the SSP according to the lower 8 bits of the delta.
28112 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
28113 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
28115 // Reset the lower 8 bits.
28116 unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
28117 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
28118 .addReg(SspFirstShrReg)
28119 .addImm(8);
28121 // Jump if the result of the shift is zero.
28122 BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28123 fixShadowMBB->addSuccessor(sinkMBB);
28124 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
28126 // Do a single shift left.
28127 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
28128 unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
28129 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
28130 .addReg(SspSecondShrReg);
28132 // Save the value 128 to a register (will be used next with incssp).
28133 unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
28134 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
28135 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
28136 .addImm(128);
28137 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
28139 // Since incssp only looks at the lower 8 bits, we might need to do several
28140 // iterations of incssp until we finish fixing the shadow stack.
28141 unsigned DecReg = MRI.createVirtualRegister(PtrRC);
28142 unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
28143 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
28144 .addReg(SspAfterShlReg)
28145 .addMBB(fixShadowLoopPrepareMBB)
28146 .addReg(DecReg)
28147 .addMBB(fixShadowLoopMBB);
28149 // Every iteration we increase the SSP by 128.
28150 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
28152 // Every iteration we decrement the counter by 1.
28153 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
28154 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
28156 // Jump if the counter is not zero yet.
28157 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
28158 fixShadowLoopMBB->addSuccessor(sinkMBB);
28159 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
28161 return sinkMBB;
28162 }
28164 MachineBasicBlock *
28165 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
28166 MachineBasicBlock *MBB) const {
28167 DebugLoc DL = MI.getDebugLoc();
28168 MachineFunction *MF = MBB->getParent();
28169 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28170 MachineRegisterInfo &MRI = MF->getRegInfo();
28172 // Memory Reference
28173 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28174 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28176 MVT PVT = getPointerTy(MF->getDataLayout());
28177 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
28178 "Invalid Pointer Size!");
28180 const TargetRegisterClass *RC =
28181 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28182 unsigned Tmp = MRI.createVirtualRegister(RC);
28183 // Since FP is only updated here but NOT referenced, it's treated as GPR.
28184 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28185 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
28186 unsigned SP = RegInfo->getStackRegister();
28188 MachineInstrBuilder MIB;
28190 const int64_t LabelOffset = 1 * PVT.getStoreSize();
28191 const int64_t SPOffset = 2 * PVT.getStoreSize();
28193 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28194 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
28196 MachineBasicBlock *thisMBB = MBB;
28198 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
28199 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
28200 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
28201 }
28203 // Reload FP
28204 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
28205 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
28206 MIB.add(MI.getOperand(i));
28207 MIB.setMemRefs(MMOBegin, MMOEnd);
28209 // Reload IP
28210 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
28211 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28212 if (i == X86::AddrDisp)
28213 MIB.addDisp(MI.getOperand(i), LabelOffset);
28214 else
28215 MIB.add(MI.getOperand(i));
28216 }
28217 MIB.setMemRefs(MMOBegin, MMOEnd);
28219 // Reload SP
28220 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
28221 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28222 if (i == X86::AddrDisp)
28223 MIB.addDisp(MI.getOperand(i), SPOffset);
28224 else
28225 MIB.add(MI.getOperand(i));
28226 }
28227 MIB.setMemRefs(MMOBegin, MMOEnd);
28229 // Jump
28230 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
28232 MI.eraseFromParent();
28233 return thisMBB;
28234 }
28236 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
28237 MachineBasicBlock *MBB,
28238 MachineBasicBlock *DispatchBB,
28239 int FI) const {
28240 DebugLoc DL = MI.getDebugLoc();
28241 MachineFunction *MF = MBB->getParent();
28242 MachineRegisterInfo *MRI = &MF->getRegInfo();
28243 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28245 MVT PVT = getPointerTy(MF->getDataLayout());
28246 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
28248 unsigned Op = 0;
28249 unsigned VR = 0;
28251 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
28252 !isPositionIndependent();
28254 if (UseImmLabel) {
28255 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
28256 } else {
28257 const TargetRegisterClass *TRC =
28258 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28259 VR = MRI->createVirtualRegister(TRC);
28260 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
28262 if (Subtarget.is64Bit())
28263 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
28264 .addReg(X86::RIP)
28265 .addImm(1)
28266 .addReg(0)
28267 .addMBB(DispatchBB)
28268 .addReg(0);
28269 else
28270 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
28271 .addReg(0) /* TII->getGlobalBaseReg(MF) */
28272 .addImm(1)
28273 .addReg(0)
28274 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
28275 .addReg(0);
28276 }
28278 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
28279 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
28280 if (UseImmLabel)
28281 MIB.addMBB(DispatchBB);
28282 else
28283 MIB.addReg(VR);
28284 }
28286 MachineBasicBlock *
28287 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
28288 MachineBasicBlock *BB) const {
28289 DebugLoc DL = MI.getDebugLoc();
28290 MachineFunction *MF = BB->getParent();
28291 MachineFrameInfo &MFI = MF->getFrameInfo();
28292 MachineRegisterInfo *MRI = &MF->getRegInfo();
28293 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28294 int FI = MFI.getFunctionContextIndex();
28296 // Get a mapping of the call site numbers to all of the landing pads they're
28297 // associated with.
28298 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
28299 unsigned MaxCSNum = 0;
28300 for (auto &MBB : *MF) {
28301 if (!MBB.isEHPad())
28302 continue;
28304 MCSymbol *Sym = nullptr;
28305 for (const auto &MI : MBB) {
28306 if (MI.isDebugInstr())
28307 continue;
28309 assert(MI.isEHLabel() && "expected EH_LABEL");
28310 Sym = MI.getOperand(0).getMCSymbol();
28311 break;
28312 }
28314 if (!MF->hasCallSiteLandingPad(Sym))
28315 continue;
28317 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
28318 CallSiteNumToLPad[CSI].push_back(&MBB);
28319 MaxCSNum = std::max(MaxCSNum, CSI);
28320 }
28321 }
28323 // Get an ordered list of the machine basic blocks for the jump table.
28324 std::vector<MachineBasicBlock *> LPadList;
28325 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
28326 LPadList.reserve(CallSiteNumToLPad.size());
28328 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
28329 for (auto &LP : CallSiteNumToLPad[CSI]) {
28330 LPadList.push_back(LP);
28331 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
28332 }
28333 }
28335 assert(!LPadList.empty() &&
28336 "No landing pad destinations for the dispatch jump table!");
28338 // Create the MBBs for the dispatch code.
28340 // Shove the dispatch's address into the return slot in the function context.
28341 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
28342 DispatchBB->setIsEHPad(true);
28344 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
28345 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
28346 DispatchBB->addSuccessor(TrapBB);
28348 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
28349 DispatchBB->addSuccessor(DispContBB);
28352 MF->push_back(DispatchBB);
28353 MF->push_back(DispContBB);
28354 MF->push_back(TrapBB);
28356 // Insert code into the entry block that creates and registers the function
28357 // context.
28358 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
28360 // Create the jump table and associated information
28361 unsigned JTE = getJumpTableEncoding();
28362 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
28363 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
28365 const X86RegisterInfo &RI = TII->getRegisterInfo();
28366 // Add a register mask with no preserved registers. This results in all
28367 // registers being marked as clobbered.
28368 if (RI.hasBasePointer(*MF)) {
28369 const bool FPIs64Bit =
28370 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
28371 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
28372 MFI->setRestoreBasePointer(MF);
28374 unsigned FP = RI.getFrameRegister(*MF);
28375 unsigned BP = RI.getBaseRegister();
28376 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
28377 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
28378 MFI->getRestoreBasePointerOffset())
28379 .addRegMask(RI.getNoPreservedMask());
28380 } else {
28381 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
28382 .addRegMask(RI.getNoPreservedMask());
28383 }
28385 // IReg is used as an index in a memory operand and therefore can't be SP
28386 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
28387 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
28388 Subtarget.is64Bit() ? 8 : 4);
28389 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
28390 .addReg(IReg)
28391 .addImm(LPadList.size());
28392 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
28394 if (Subtarget.is64Bit()) {
28395 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28396 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
28398 // leaq .LJTI0_0(%rip), BReg
28399 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
28400 .addReg(X86::RIP)
28401 .addImm(1)
28402 .addReg(0)
28403 .addJumpTableIndex(MJTI)
28404 .addReg(0);
28405 // movzx IReg64, IReg
28406 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
28407 .addImm(0)
28408 .addReg(IReg)
28409 .addImm(X86::sub_32bit);
28411 switch (JTE) {
28412 case MachineJumpTableInfo::EK_BlockAddress:
28413 // jmpq *(BReg,IReg64,8)
28414 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
28415 .addReg(BReg)
28416 .addImm(8)
28417 .addReg(IReg64)
28418 .addImm(0)
28419 .addReg(0);
28420 break;
28421 case MachineJumpTableInfo::EK_LabelDifference32: {
28422 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
28423 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
28424 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28426 // movl (BReg,IReg64,4), OReg
28427 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
28428 .addReg(BReg)
28429 .addImm(4)
28430 .addReg(IReg64)
28431 .addImm(0)
28432 .addReg(0);
28433 // movsx OReg64, OReg
28434 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
28435 // addq BReg, OReg64, TReg
28436 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
28437 .addReg(OReg64)
28438 .addReg(BReg);
28439 // jmpq *TReg
28440 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
28441 break;
28442 }
28443 default:
28444 llvm_unreachable("Unexpected jump table encoding");
28445 }
28446 } else {
28447 // jmpl *.LJTI0_0(,IReg,4)
28448 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
28449 .addReg(0)
28450 .addImm(4)
28451 .addReg(IReg)
28452 .addJumpTableIndex(MJTI)
28453 .addReg(0);
28454 }
28456 // Add the jump table entries as successors to the MBB.
28457 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
28458 for (auto &LP : LPadList)
28459 if (SeenMBBs.insert(LP).second)
28460 DispContBB->addSuccessor(LP);
28462 // N.B. the order the invoke BBs are processed in doesn't matter here.
28463 SmallVector<MachineBasicBlock *, 64> MBBLPads;
28464 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
28465 for (MachineBasicBlock *MBB : InvokeBBs) {
28466 // Remove the landing pad successor from the invoke block and replace it
28467 // with the new dispatch block.
28468 // Keep a copy of Successors since it's modified inside the loop.
28469 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
28470 MBB->succ_rend());
28471 // FIXME: Avoid quadratic complexity.
28472 for (auto MBBS : Successors) {
28473 if (MBBS->isEHPad()) {
28474 MBB->removeSuccessor(MBBS);
28475 MBBLPads.push_back(MBBS);
28476 }
28477 }
28479 MBB->addSuccessor(DispatchBB);
28481 // Find the invoke call and mark all of the callee-saved registers as
28482 // 'implicit defined' so that they're spilled. This prevents code from
28483 // moving instructions to before the EH block, where they will never be
28485 for (auto &II : reverse(*MBB)) {
28486 if (!II.isCall())
28487 continue;
28489 DenseMap<unsigned, bool> DefRegs;
28490 for (auto &MOp : II.operands())
28491 if (MOp.isReg())
28492 DefRegs[MOp.getReg()] = true;
28494 MachineInstrBuilder MIB(*MF, &II);
28495 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
28496 unsigned Reg = SavedRegs[RI];
28497 if (!DefRegs[Reg])
28498 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
28499 }
28501 break;
28502 }
28503 }
28505 // Mark all former landing pads as non-landing pads. The dispatch is the only
28506 // landing pad now.
28507 for (auto &LP : MBBLPads)
28508 LP->setIsEHPad(false);
28510 // The instruction is gone now.
28511 MI.eraseFromParent();
28513 return BB;
28514 }
28515 MachineBasicBlock *
28516 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
28517 MachineBasicBlock *BB) const {
28518 MachineFunction *MF = BB->getParent();
28519 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28520 DebugLoc DL = MI.getDebugLoc();
28522 switch (MI.getOpcode()) {
28523 default: llvm_unreachable("Unexpected instr type to insert");
28524 case X86::TLS_addr32:
28525 case X86::TLS_addr64:
28526 case X86::TLS_base_addr32:
28527 case X86::TLS_base_addr64:
28528 return EmitLoweredTLSAddr(MI, BB);
28529 case X86::RETPOLINE_CALL32:
28530 case X86::RETPOLINE_CALL64:
28531 case X86::RETPOLINE_TCRETURN32:
28532 case X86::RETPOLINE_TCRETURN64:
28533 return EmitLoweredRetpoline(MI, BB);
28534 case X86::CATCHRET:
28535 return EmitLoweredCatchRet(MI, BB);
28536 case X86::CATCHPAD:
28537 return EmitLoweredCatchPad(MI, BB);
28538 case X86::SEG_ALLOCA_32:
28539 case X86::SEG_ALLOCA_64:
28540 return EmitLoweredSegAlloca(MI, BB);
28541 case X86::TLSCall_32:
28542 case X86::TLSCall_64:
28543 return EmitLoweredTLSCall(MI, BB);
28544 case X86::CMOV_FR32:
28545 case X86::CMOV_FR64:
28546 case X86::CMOV_F128:
28547 case X86::CMOV_GR8:
28548 case X86::CMOV_GR16:
28549 case X86::CMOV_GR32:
28550 case X86::CMOV_RFP32:
28551 case X86::CMOV_RFP64:
28552 case X86::CMOV_RFP80:
28553 case X86::CMOV_V2F64:
28554 case X86::CMOV_V2I64:
28555 case X86::CMOV_V4F32:
28556 case X86::CMOV_V4F64:
28557 case X86::CMOV_V4I64:
28558 case X86::CMOV_V16F32:
28559 case X86::CMOV_V8F32:
28560 case X86::CMOV_V8F64:
28561 case X86::CMOV_V8I64:
28562 case X86::CMOV_V8I1:
28563 case X86::CMOV_V16I1:
28564 case X86::CMOV_V32I1:
28565 case X86::CMOV_V64I1:
28566 return EmitLoweredSelect(MI, BB);
28568 case X86::RDFLAGS32:
28569 case X86::RDFLAGS64: {
28570 unsigned PushF =
28571 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
28572 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
28573 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
28574 // Permit reads of the EFLAGS and DF registers without them being defined.
28575 // This intrinsic exists to read external processor state in flags, such as
28576 // the trap flag, interrupt flag, and direction flag, none of which are
28577 // modeled by the backend.
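// In effect, RDFLAGS expands to a pushf/pop pair (e.g. pushfq ; popq %dst
// on 64-bit), and WRFLAGS below to the mirror-image push/popf pair.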
28578 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
28579 "Unexpected register in operand!");
28580 Push->getOperand(2).setIsUndef();
28581 assert(Push->getOperand(3).getReg() == X86::DF &&
28582 "Unexpected register in operand!");
28583 Push->getOperand(3).setIsUndef();
28584 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
28586 MI.eraseFromParent(); // The pseudo is gone now.
28588 return BB;
28589 }
28590 case X86::WRFLAGS32:
28591 case X86::WRFLAGS64: {
28592 unsigned Push =
28593 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
28594 unsigned PopF =
28595 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
28596 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
28597 BuildMI(*BB, MI, DL, TII->get(PopF));
28599 MI.eraseFromParent(); // The pseudo is gone now.
28601 return BB;
28602 }
28603 case X86::RELEASE_FADD32mr:
28604 case X86::RELEASE_FADD64mr:
28605 return EmitLoweredAtomicFP(MI, BB);
28607 case X86::FP32_TO_INT16_IN_MEM:
28608 case X86::FP32_TO_INT32_IN_MEM:
28609 case X86::FP32_TO_INT64_IN_MEM:
28610 case X86::FP64_TO_INT16_IN_MEM:
28611 case X86::FP64_TO_INT32_IN_MEM:
28612 case X86::FP64_TO_INT64_IN_MEM:
28613 case X86::FP80_TO_INT16_IN_MEM:
28614 case X86::FP80_TO_INT32_IN_MEM:
28615 case X86::FP80_TO_INT64_IN_MEM: {
28616 // Change the floating point control register to use "round towards zero"
28617 // mode when truncating to an integer value.
28618 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
28619 addFrameReference(BuildMI(*BB, MI, DL,
28620 TII->get(X86::FNSTCW16m)), CWFrameIdx);
28622 // Load the old value of the control word...
28623 unsigned OldCW =
28624 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
28625 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
28626 CWFrameIdx);
28628 // Set the high part to be round to zero...
28629 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
28630 .addImm(0xC7F);
28632 // Reload the modified control word now...
28633 addFrameReference(BuildMI(*BB, MI, DL,
28634 TII->get(X86::FLDCW16m)), CWFrameIdx);
28636 // Restore the memory image of control word to original value
28637 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
28638 .addReg(OldCW);
28640 // Get the X86 opcode to use.
28641 unsigned Opc;
28642 switch (MI.getOpcode()) {
28643 default: llvm_unreachable("illegal opcode!");
28644 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
28645 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
28646 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
28647 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
28648 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
28649 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
28650 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
28651 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
28652 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
28653 }
28655 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28656 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
28657 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
28659 // Reload the original control word now.
28660 addFrameReference(BuildMI(*BB, MI, DL,
28661 TII->get(X86::FLDCW16m)), CWFrameIdx);
28663 MI.eraseFromParent(); // The pseudo instruction is gone now.
28664 return BB;
28665 }
28666 // Thread synchronization.
28667 case X86::MONITOR:
28668 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
28669 case X86::MONITORX:
28670 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
28672 // Cache line zero
28673 case X86::CLZERO:
28674 return emitClzero(&MI, BB, Subtarget);
28676 // PKU feature
28677 case X86::WRPKRU:
28678 return emitWRPKRU(MI, BB, Subtarget);
28679 case X86::RDPKRU:
28680 return emitRDPKRU(MI, BB, Subtarget);
28682 // xbegin
28683 case X86::XBEGIN:
28684 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
28685 case X86::VASTART_SAVE_XMM_REGS:
28686 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
28688 case X86::VAARG_64:
28689 return EmitVAARG64WithCustomInserter(MI, BB);
28691 case X86::EH_SjLj_SetJmp32:
28692 case X86::EH_SjLj_SetJmp64:
28693 return emitEHSjLjSetJmp(MI, BB);
28695 case X86::EH_SjLj_LongJmp32:
28696 case X86::EH_SjLj_LongJmp64:
28697 return emitEHSjLjLongJmp(MI, BB);
28699 case X86::Int_eh_sjlj_setup_dispatch:
28700 return EmitSjLjDispatchBlock(MI, BB);
28702 case TargetOpcode::STATEPOINT:
28703 // As an implementation detail, STATEPOINT shares the STACKMAP format at
28704 // this point in the process. We diverge later.
28705 return emitPatchPoint(MI, BB);
28707 case TargetOpcode::STACKMAP:
28708 case TargetOpcode::PATCHPOINT:
28709 return emitPatchPoint(MI, BB);
28711 case TargetOpcode::PATCHABLE_EVENT_CALL:
28712 return emitXRayCustomEvent(MI, BB);
28714 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
28715 return emitXRayTypedEvent(MI, BB);
28717 case X86::LCMPXCHG8B: {
28718 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
28719 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
28720 // requires a memory operand. If the current architecture is i686 and the
28721 // current function needs a base pointer - which is ESI on i686 - the
28722 // register allocator would not be able to allocate registers for an
28723 // address of the form X(%reg, %reg, Y): there would never be enough
28724 // unreserved registers during regalloc (without the base pointer the only
28725 // option would be X(%edi, %esi, Y)).
28726 // We give the register allocator a hand by precomputing the address in
28727 // a new vreg using LEA.
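// Sketch of the rewrite (illustrative registers): instead of
//   cmpxchg8b X(%edi,%esi,Y)
// we emit
//   leal X(%edi,%esi,Y), %vreg ; cmpxchg8b (%vreg)
// so the address no longer ties up two general-purpose registers.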
28729 // If it is not i686 or there is no base pointer - nothing to do here.
28730 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
28731 return BB;
28733 // Even though this code does not strictly need the base pointer to
28734 // be ESI, we assert that it is. The reason: if this assert fires, some
28735 // change has happened in the compiler's base pointer handling, which most
28736 // probably has to be addressed here as well.
28737 assert(TRI->getBaseRegister() == X86::ESI &&
28738 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
28739 "base pointer in mind");
28741 MachineRegisterInfo &MRI = MF->getRegInfo();
28742 MVT SPTy = getPointerTy(MF->getDataLayout());
28743 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
28744 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
28746 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28747 // Regalloc does not need any help when the memory operand of CMPXCHG8B
28748 // does not use index register.
28749 if (AM.IndexReg == X86::NoRegister)
28750 return BB;
28752 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
28753 // four operand definitions that are E[ABCD] registers. We skip them and
28754 // then insert the LEA.
28755 MachineBasicBlock::iterator MBBI(MI);
28756 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
28757 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
28758 --MBBI;
28759 addFullAddress(
28760 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
28762 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
28764 return BB;
28765 }
28766 case X86::LCMPXCHG16B:
28767 return BB;
28768 case X86::LCMPXCHG8B_SAVE_EBX:
28769 case X86::LCMPXCHG16B_SAVE_RBX: {
28770 unsigned BasePtr =
28771 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
28772 if (!BB->isLiveIn(BasePtr))
28773 BB->addLiveIn(BasePtr);
28774 return BB;
28775 }
28776 }
28777 }
28779 //===----------------------------------------------------------------------===//
28780 // X86 Optimization Hooks
28781 //===----------------------------------------------------------------------===//
28783 bool
28784 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
28785 const APInt &Demanded,
28786 TargetLoweringOpt &TLO) const {
28787 // Only optimize Ands to prevent shrinking a constant that could be
28788 // matched by movzx.
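// For example (illustrative values): if only the low 9 bits of the result
// are demanded, (and x, 0x1ff) may be replaced with (and x, 0xffff), which
// can then be matched as a single movzwl rather than a wider and-immediate.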
28789 if (Op.getOpcode() != ISD::AND)
28790 return false;
28792 EVT VT = Op.getValueType();
28794 // Ignore vectors.
28795 if (VT.isVector())
28796 return false;
28798 unsigned Size = VT.getSizeInBits();
28800 // Make sure the RHS really is a constant.
28801 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
28802 if (!C)
28803 return false;
28805 const APInt &Mask = C->getAPIntValue();
28807 // Clear all non-demanded bits initially.
28808 APInt ShrunkMask = Mask & Demanded;
28810 // Find the width of the shrunk mask.
28811 unsigned Width = ShrunkMask.getActiveBits();
28813 // If the mask is all 0s there's nothing to do here.
28814 if (Width == 0)
28815 return false;
28817 // Find the next power of 2 width, rounding up to a byte.
28818 Width = PowerOf2Ceil(std::max(Width, 8U));
28819 // Truncate the width to size to handle illegal types.
28820 Width = std::min(Width, Size);
28822 // Calculate a possible zero extend mask for this constant.
28823 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
28825 // If we aren't changing the mask, just return true to keep it and prevent
28826 // the caller from optimizing.
28827 if (ZeroExtendMask == Mask)
28828 return true;
28830 // Make sure the new mask can be represented by a combination of mask bits
28831 // and non-demanded bits.
28832 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
28833 return false;
28835 // Replace the constant with the zero extend mask.
28836 SDLoc DL(Op);
28837 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
28838 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
28839 return TLO.CombineTo(Op, NewOp);
28840 }
28842 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
28843 KnownBits &Known,
28844 const APInt &DemandedElts,
28845 const SelectionDAG &DAG,
28846 unsigned Depth) const {
28847 unsigned BitWidth = Known.getBitWidth();
28848 unsigned Opc = Op.getOpcode();
28849 EVT VT = Op.getValueType();
28850 assert((Opc >= ISD::BUILTIN_OP_END ||
28851 Opc == ISD::INTRINSIC_WO_CHAIN ||
28852 Opc == ISD::INTRINSIC_W_CHAIN ||
28853 Opc == ISD::INTRINSIC_VOID) &&
28854 "Should use MaskedValueIsZero if you don't know whether Op"
28855 " is a target node!");
28860 case X86ISD::SETCC:
28861 Known.Zero.setBitsFrom(1);
28863 case X86ISD::MOVMSK: {
28864 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
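// e.g. for a v4f32 input MOVMSK defines only the low 4 bits of its i32
// result, so everything above the element count is known zero.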
28865 Known.Zero.setBitsFrom(NumLoBits);
28866 break;
28867 }
28868 case X86ISD::PEXTRB:
28869 case X86ISD::PEXTRW: {
28870 SDValue Src = Op.getOperand(0);
28871 EVT SrcVT = Src.getValueType();
28872 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
28873 Op.getConstantOperandVal(1));
28874 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
28875 Known = Known.zextOrTrunc(BitWidth);
28876 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
28877 break;
28878 }
28879 case X86ISD::VSHLI:
28880 case X86ISD::VSRLI: {
28881 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
28882 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
28883 Known.setAllZero();
28884 break;
28885 }
28887 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
28888 unsigned ShAmt = ShiftImm->getZExtValue();
28889 if (Opc == X86ISD::VSHLI) {
28890 Known.Zero <<= ShAmt;
28891 Known.One <<= ShAmt;
28892 // Low bits are known zero.
28893 Known.Zero.setLowBits(ShAmt);
28894 } else {
28895 Known.Zero.lshrInPlace(ShAmt);
28896 Known.One.lshrInPlace(ShAmt);
28897 // High bits are known zero.
28898 Known.Zero.setHighBits(ShAmt);
28899 }
28900 }
28901 break;
28902 }
28903 case X86ISD::PACKUS: {
28904 // PACKUS is just a truncation if the upper half is zero.
28905 // TODO: Add DemandedElts support.
28906 KnownBits Known2;
28907 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
28908 DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
28909 Known.One &= Known2.One;
28910 Known.Zero &= Known2.Zero;
28911 if (Known.countMinLeadingZeros() < BitWidth)
28912 Known.resetAll();
28913 Known = Known.trunc(BitWidth);
28914 break;
28915 }
28916 case X86ISD::VZEXT: {
28917 // TODO: Add DemandedElts support.
28918 SDValue N0 = Op.getOperand(0);
28919 unsigned NumElts = VT.getVectorNumElements();
28921 EVT SrcVT = N0.getValueType();
28922 unsigned InNumElts = SrcVT.getVectorNumElements();
28923 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
28924 assert(InNumElts >= NumElts && "Illegal VZEXT input");
28926 Known = KnownBits(InBitWidth);
28927 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
28928 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
28929 Known = Known.zext(BitWidth);
28930 Known.Zero.setBitsFrom(InBitWidth);
28933 case X86ISD::CMOV: {
28934 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
28935 // If we don't know any bits, early out.
28936 if (Known.isUnknown())
28939 DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
28941 // Only known if known in both the LHS and RHS.
28942 Known.One &= Known2.One;
28943 Known.Zero &= Known2.Zero;
28946 case X86ISD::UDIVREM8_ZEXT_HREG:
28947 // TODO: Support more than just the zero extended bits?
28948 if (Op.getResNo() != 1)
28950 // The remainder is zero extended.
28951 Known.Zero.setBitsFrom(8);
28955 // Handle target shuffles.
28956 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
28957 if (isTargetShuffle(Opc)) {
28959 SmallVector<int, 64> Mask;
28960 SmallVector<SDValue, 2> Ops;
28961 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
28963 unsigned NumOps = Ops.size();
28964 unsigned NumElts = VT.getVectorNumElements();
28965 if (Mask.size() == NumElts) {
28966 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
28967 Known.Zero.setAllBits(); Known.One.setAllBits();
28968 for (unsigned i = 0; i != NumElts; ++i) {
28969 if (!DemandedElts[i])
28972 if (M == SM_SentinelUndef) {
28973 // For UNDEF elements, we don't know anything about the common state
28974 // of the shuffle result.
28977 } else if (M == SM_SentinelZero) {
28978 Known.One.clearAllBits();
28981 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
28982 "Shuffle index out of range");
28984 unsigned OpIdx = (unsigned)M / NumElts;
28985 unsigned EltIdx = (unsigned)M % NumElts;
28986 if (Ops[OpIdx].getValueType() != VT) {
28987 // TODO - handle target shuffle ops with different value types.
28991 DemandedOps[OpIdx].setBit(EltIdx);
28993 // Known bits are the values that are shared by every demanded element.
28994 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
28995 if (!DemandedOps[i])
28998 DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
28999 Known.One &= Known2.One;
29000 Known.Zero &= Known2.Zero;
29007 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
29008 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
29009 unsigned Depth) const {
29010 unsigned VTBits = Op.getScalarValueSizeInBits();
29011 unsigned Opcode = Op.getOpcode();
29013 case X86ISD::SETCC_CARRY:
29014 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
29017 case X86ISD::VSEXT: {
29018 // TODO: Add DemandedElts support.
29019 SDValue Src = Op.getOperand(0);
29020 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
Tmp += VTBits - Src.getScalarValueSizeInBits();
return Tmp;
29025 case X86ISD::VTRUNC: {
29026 // TODO: Add DemandedElts support.
29027 SDValue Src = Op.getOperand(0);
29028 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
29029 assert(VTBits < NumSrcBits && "Illegal truncation input type");
29030 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
29031 if (Tmp > (NumSrcBits - VTBits))
29032 return Tmp - (NumSrcBits - VTBits);
29036 case X86ISD::PACKSS: {
29037 // PACKSS is just a truncation if the sign bits extend to the packed size.
29038 // TODO: Add DemandedElts support.
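// For example, PACKSSWB of vXi16 inputs with at least 9 sign bits each
// yields a vXi8 result with (Tmp - 8) sign bits.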
29039 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
29040 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
29041 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
29042 unsigned Tmp = std::min(Tmp0, Tmp1);
29043 if (Tmp > (SrcBits - VTBits))
29044 return Tmp - (SrcBits - VTBits);
29048 case X86ISD::VSHLI: {
29049 SDValue Src = Op.getOperand(0);
29050 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
29051 if (ShiftVal.uge(VTBits))
29052 return VTBits; // Shifted all bits out --> zero.
29053 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
29054 if (ShiftVal.uge(Tmp))
29055 return 1; // Shifted all sign bits out --> unknown.
29056 return Tmp - ShiftVal.getZExtValue();
29059 case X86ISD::VSRAI: {
29060 SDValue Src = Op.getOperand(0);
29061 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
29062 if (ShiftVal.uge(VTBits - 1))
29063 return VTBits; // Sign splat.
29064 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
ShiftVal += Tmp;
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
29069 case X86ISD::PCMPGT:
29070 case X86ISD::PCMPEQ:
29072 case X86ISD::VPCOM:
29073 case X86ISD::VPCOMU:
29074 // Vector compares return zero/all-bits result values.
29077 case X86ISD::CMOV: {
29078 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
29079 if (Tmp0 == 1) return 1; // Early out.
29080 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
29081 return std::min(Tmp0, Tmp1);
29083 case X86ISD::SDIVREM8_SEXT_HREG:
29084 // TODO: Support more than just the sign extended bits?
29085 if (Op.getResNo() != 1)
29087 // The remainder is sign extended.
29095 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
29096 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
29097 return N->getOperand(0);
29101 /// Returns true (and the GlobalValue and the offset) if the node is a
29102 /// GlobalAddress + offset.
29103 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
29104 const GlobalValue* &GA,
29105 int64_t &Offset) const {
29106 if (N->getOpcode() == X86ISD::Wrapper) {
29107 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
29108 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
29109 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
29113 return TargetLowering::isGAPlusOffset(N, GA, Offset);
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
29118 // TODO: Investigate sharing more of this with shuffle lowering.
29119 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29120 bool AllowFloatDomain, bool AllowIntDomain,
29121 SDValue &V1, const SDLoc &DL,
29123 const X86Subtarget &Subtarget,
29124 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
29125 unsigned NumMaskElts = Mask.size();
29126 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
29128 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
29129 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
29130 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
29131 Shuffle = X86ISD::VZEXT_MOVL;
29132 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
29136 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
29137 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
29138 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
29139 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
29140 unsigned MaxScale = 64 / MaskEltSize;
29141 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
29143 unsigned NumDstElts = NumMaskElts / Scale;
29144 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
29145 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
29146 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
29149 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
29150 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
29151 MVT::getIntegerVT(MaskEltSize);
29152 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
29154 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
29155 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
29156 Shuffle = unsigned(X86ISD::VZEXT);
29158 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
29160 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
29161 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
29167 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
29168 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
29169 isUndefOrEqual(Mask[0], 0) &&
29170 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
29171 Shuffle = X86ISD::VZEXT_MOVL;
29172 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
// Check if we have SSE3 which will let us use MOVDDUP etc. The
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
29179 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
29180 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
29181 Shuffle = X86ISD::MOVDDUP;
29182 SrcVT = DstVT = MVT::v2f64;
29185 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
29186 Shuffle = X86ISD::MOVSLDUP;
29187 SrcVT = DstVT = MVT::v4f32;
29190 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
29191 Shuffle = X86ISD::MOVSHDUP;
29192 SrcVT = DstVT = MVT::v4f32;
29197 if (MaskVT.is256BitVector() && AllowFloatDomain) {
29198 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
29199 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
29200 Shuffle = X86ISD::MOVDDUP;
29201 SrcVT = DstVT = MVT::v4f64;
29204 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
29205 Shuffle = X86ISD::MOVSLDUP;
29206 SrcVT = DstVT = MVT::v8f32;
29209 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
29210 Shuffle = X86ISD::MOVSHDUP;
29211 SrcVT = DstVT = MVT::v8f32;
29216 if (MaskVT.is512BitVector() && AllowFloatDomain) {
29217 assert(Subtarget.hasAVX512() &&
29218 "AVX512 required for 512-bit vector shuffles");
29219 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
29220 Shuffle = X86ISD::MOVDDUP;
29221 SrcVT = DstVT = MVT::v8f64;
29224 if (isTargetShuffleEquivalent(
29225 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
29226 Shuffle = X86ISD::MOVSLDUP;
29227 SrcVT = DstVT = MVT::v16f32;
29230 if (isTargetShuffleEquivalent(
29231 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
29232 Shuffle = X86ISD::MOVSHDUP;
29233 SrcVT = DstVT = MVT::v16f32;
29238 // Attempt to match against broadcast-from-vector.
29239 if (Subtarget.hasAVX2()) {
29240 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
29241 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
29242 SrcVT = DstVT = MaskVT;
29243 Shuffle = X86ISD::VBROADCAST;
29251 // Attempt to match a combined shuffle mask against supported unary immediate
29252 // permute instructions.
29253 // TODO: Investigate sharing more of this with shuffle lowering.
29254 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29255 const APInt &Zeroable,
29256 bool AllowFloatDomain,
29257 bool AllowIntDomain,
29258 const X86Subtarget &Subtarget,
29259 unsigned &Shuffle, MVT &ShuffleVT,
29260 unsigned &PermuteImm) {
29261 unsigned NumMaskElts = Mask.size();
29262 unsigned InputSizeInBits = MaskVT.getSizeInBits();
29263 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
29264 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
29266 bool ContainsZeros =
29267 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
// Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
29270 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
29271 // Check for lane crossing permutes.
29272 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
29273 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
29274 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
29275 Shuffle = X86ISD::VPERMI;
29276 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
29277 PermuteImm = getV4X86ShuffleImm(Mask);
29280 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
29281 SmallVector<int, 4> RepeatedMask;
29282 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
29283 Shuffle = X86ISD::VPERMI;
29284 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
29285 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
29289 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
29290 // VPERMILPD can permute with a non-repeating shuffle.
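// For example, a v4f64 mask {1,0,3,2} sets bit i to (Mask[i] & 1), giving
// PermuteImm = 0b0101.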
29291 Shuffle = X86ISD::VPERMILPI;
29292 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
29294 for (int i = 0, e = Mask.size(); i != e; ++i) {
29296 if (M == SM_SentinelUndef)
29298 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
29299 PermuteImm |= (M & 1) << i;
29305 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
// AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
29307 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
29308 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
29309 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
29310 SmallVector<int, 4> RepeatedMask;
29311 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
29312 // Narrow the repeated mask to create 32-bit element permutes.
29313 SmallVector<int, 4> WordMask = RepeatedMask;
29314 if (MaskScalarSizeInBits == 64)
29315 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
29317 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
29318 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
29319 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
29320 PermuteImm = getV4X86ShuffleImm(WordMask);
29325 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
29326 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
29327 SmallVector<int, 4> RepeatedMask;
29328 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
29329 ArrayRef<int> LoMask(Mask.data() + 0, 4);
29330 ArrayRef<int> HiMask(Mask.data() + 4, 4);
29332 // PSHUFLW: permute lower 4 elements only.
29333 if (isUndefOrInRange(LoMask, 0, 4) &&
29334 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
29335 Shuffle = X86ISD::PSHUFLW;
29336 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
29337 PermuteImm = getV4X86ShuffleImm(LoMask);
29341 // PSHUFHW: permute upper 4 elements only.
29342 if (isUndefOrInRange(HiMask, 4, 8) &&
29343 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
29344 // Offset the HiMask so that we can create the shuffle immediate.
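// For example, a HiMask of {5,4,7,6} is offset to {1,0,3,2}.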
29345 int OffsetHiMask[4];
29346 for (int i = 0; i != 4; ++i)
29347 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
29349 Shuffle = X86ISD::PSHUFHW;
29350 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
29351 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
29357 // Attempt to match against byte/bit shifts.
29358 // FIXME: Add 512-bit support.
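// For example, a v8i16 mask {Z,0,1,2,3,4,5,6} matches a 2-byte PSLLDQ.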
29359 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29360 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
29361 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
29362 MaskScalarSizeInBits, Mask,
29363 0, Zeroable, Subtarget);
29364 if (0 < ShiftAmt) {
29365 PermuteImm = (unsigned)ShiftAmt;
29373 // Attempt to match a combined unary shuffle mask against supported binary
29374 // shuffle instructions.
29375 // TODO: Investigate sharing more of this with shuffle lowering.
29376 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29377 bool AllowFloatDomain, bool AllowIntDomain,
29378 SDValue &V1, SDValue &V2, const SDLoc &DL,
29380 const X86Subtarget &Subtarget,
29381 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
29383 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
29385 if (MaskVT.is128BitVector()) {
29386 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
29388 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
29389 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
29390 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
29393 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
29395 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
29396 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
29399 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
29400 (AllowFloatDomain || !Subtarget.hasSSE41())) {
29402 Shuffle = X86ISD::MOVSD;
29403 SrcVT = DstVT = MVT::v2f64;
29406 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
29407 (AllowFloatDomain || !Subtarget.hasSSE41())) {
29408 Shuffle = X86ISD::MOVSS;
29409 SrcVT = DstVT = MVT::v4f32;
29414 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
29415 // TODO add support for 256/512-bit types.
29416 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
29417 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
29424 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
29425 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
29426 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29427 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
29428 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
29429 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
29430 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
29432 SrcVT = DstVT = MaskVT;
29433 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
29434 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
29442 static bool matchBinaryPermuteVectorShuffle(
29443 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
29444 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
29445 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
29446 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
29447 unsigned NumMaskElts = Mask.size();
29448 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
29450 // Attempt to match against PALIGNR byte rotate.
29451 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29452 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
29453 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
29454 if (0 < ByteRotation) {
29455 Shuffle = X86ISD::PALIGNR;
29456 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
29457 PermuteImm = ByteRotation;
29462 // Attempt to combine to X86ISD::BLENDI.
29463 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
29464 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
29465 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
29466 uint64_t BlendMask = 0;
29467 bool ForceV1Zero = false, ForceV2Zero = false;
29468 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
29469 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
29471 if (MaskVT == MVT::v16i16) {
29472 // We can only use v16i16 PBLENDW if the lanes are repeated.
29473 SmallVector<int, 8> RepeatedMask;
29474 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
29476 assert(RepeatedMask.size() == 8 &&
29477 "Repeated mask size doesn't match!");
29479 for (int i = 0; i < 8; ++i)
29480 if (RepeatedMask[i] >= 8)
29481 PermuteImm |= 1 << i;
29482 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
29483 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
29484 Shuffle = X86ISD::BLENDI;
29485 ShuffleVT = MaskVT;
29489 // Determine a type compatible with X86ISD::BLENDI.
29490 ShuffleVT = MaskVT;
29491 if (Subtarget.hasAVX2()) {
29492 if (ShuffleVT == MVT::v4i64)
29493 ShuffleVT = MVT::v8i32;
29494 else if (ShuffleVT == MVT::v2i64)
29495 ShuffleVT = MVT::v4i32;
29497 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
29498 ShuffleVT = MVT::v8i16;
29499 else if (ShuffleVT == MVT::v4i64)
29500 ShuffleVT = MVT::v4f64;
29501 else if (ShuffleVT == MVT::v8i32)
29502 ShuffleVT = MVT::v8f32;
29505 if (!ShuffleVT.isFloatingPoint()) {
29506 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
29508 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
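// For example, without AVX2 a v2i64 blend mask 0b10 is scaled (Scale = 4)
// to the v8i16 PBLENDW mask 0b11110000.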
29509 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
29510 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
29513 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
29514 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
29515 PermuteImm = (unsigned)BlendMask;
29516 Shuffle = X86ISD::BLENDI;
29522 // Attempt to combine to INSERTPS.
29523 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
29524 MaskVT.is128BitVector()) {
29525 if (Zeroable.getBoolValue() &&
29526 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
29527 Shuffle = X86ISD::INSERTPS;
29528 ShuffleVT = MVT::v4f32;
29533 // Attempt to combine to SHUFPD.
29534 if (AllowFloatDomain && EltSizeInBits == 64 &&
29535 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29536 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
29537 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
29538 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
29539 Shuffle = X86ISD::SHUFP;
29540 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
29545 // Attempt to combine to SHUFPS.
29546 if (AllowFloatDomain && EltSizeInBits == 32 &&
29547 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
29548 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
29549 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
29550 SmallVector<int, 4> RepeatedMask;
29551 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
// Match each half of the repeated mask to determine if it is just
// referencing one of the vectors, is zeroable, or is entirely undef.
29554 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
29555 int M0 = RepeatedMask[Offset];
29556 int M1 = RepeatedMask[Offset + 1];
29558 if (isUndefInRange(RepeatedMask, Offset, 2)) {
29559 return DAG.getUNDEF(MaskVT);
29560 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
29561 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
29562 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
29563 return getZeroVector(MaskVT, Subtarget, DAG, DL);
29564 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
29565 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
29566 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
29568 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
29569 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
29570 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
29577 int ShufMask[4] = {-1, -1, -1, -1};
29578 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
29579 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
29584 Shuffle = X86ISD::SHUFP;
29585 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
29586 PermuteImm = getV4X86ShuffleImm(ShufMask);
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
29598 /// This is the leaf of the recursive combine below. When we have found some
29599 /// chain of single-use x86 shuffle instructions and accumulated the combined
29600 /// shuffle mask represented by them, this will try to pattern match that mask
29601 /// into either a single instruction if there is a special purpose instruction
29602 /// for this operation, or into a PSHUFB instruction which is a fully general
29603 /// instruction but should only be used to replace chains over a certain depth.
29604 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
29605 ArrayRef<int> BaseMask, int Depth,
29606 bool HasVariableMask, SelectionDAG &DAG,
29607 const X86Subtarget &Subtarget) {
29608 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
29609 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
29610 "Unexpected number of shuffle inputs!");
29612 // Find the inputs that enter the chain. Note that multiple uses are OK
29613 // here, we're not going to remove the operands we find.
29614 bool UnaryShuffle = (Inputs.size() == 1);
29615 SDValue V1 = peekThroughBitcasts(Inputs[0]);
29616 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
29617 : peekThroughBitcasts(Inputs[1]));
29619 MVT VT1 = V1.getSimpleValueType();
29620 MVT VT2 = V2.getSimpleValueType();
29621 MVT RootVT = Root.getSimpleValueType();
29622 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
29623 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
29624 "Vector size mismatch");
29629 unsigned NumBaseMaskElts = BaseMask.size();
29630 if (NumBaseMaskElts == 1) {
29631 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
29632 return DAG.getBitcast(RootVT, V1);
29635 unsigned RootSizeInBits = RootVT.getSizeInBits();
29636 unsigned NumRootElts = RootVT.getVectorNumElements();
29637 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
29638 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
29639 (RootVT.isFloatingPoint() && Depth >= 2) ||
29640 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are an AVX512/EVEX target and the mask element size
29643 // is different from the root element size - this would prevent writemasks
29644 // from being reused.
29645 // TODO - this currently prevents all lane shuffles from occurring.
29646 // TODO - check for writemasks usage instead of always preventing combining.
29647 // TODO - attempt to narrow Mask back to writemask size.
29648 bool IsEVEXShuffle =
29649 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
29651 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
29653 // Handle 128-bit lane shuffles of 256-bit vectors.
29654 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
29655 // we need to use the zeroing feature.
29656 // TODO - this should support binary shuffles.
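// For example, BaseMask {1,0} swaps the two 128-bit halves (PermMask =
// 0x01), while a zeroable half sets bit 3 (0x8) of its nibble instead.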
29657 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
29658 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
29659 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
29660 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
29661 return SDValue(); // Nothing to do!
29662 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
29663 unsigned PermMask = 0;
29664 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
29665 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
29667 Res = DAG.getBitcast(ShuffleVT, V1);
29668 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
29669 DAG.getUNDEF(ShuffleVT),
29670 DAG.getConstant(PermMask, DL, MVT::i8));
29671 return DAG.getBitcast(RootVT, Res);
29674 // For masks that have been widened to 128-bit elements or more,
29675 // narrow back down to 64-bit elements.
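// For example, a 256-bit shuffle with BaseMask {1,0} over 128-bit elements
// becomes Mask {2,3,0,1} over 64-bit elements.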
29676 SmallVector<int, 64> Mask;
29677 if (BaseMaskEltSizeInBits > 64) {
29678 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
29679 int MaskScale = BaseMaskEltSizeInBits / 64;
29680 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
29682 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
29685 unsigned NumMaskElts = Mask.size();
29686 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
29688 // Determine the effective mask value type.
29689 FloatDomain &= (32 <= MaskEltSizeInBits);
29690 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
29691 : MVT::getIntegerVT(MaskEltSizeInBits);
29692 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
29694 // Only allow legal mask types.
29695 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
29698 // Attempt to match the mask against known shuffle patterns.
29699 MVT ShuffleSrcVT, ShuffleVT;
29700 unsigned Shuffle, PermuteImm;
29702 // Which shuffle domains are permitted?
29703 // Permit domain crossing at higher combine depths.
29704 bool AllowFloatDomain = FloatDomain || (Depth > 3);
29705 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
29706 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
29708 // Determine zeroable mask elements.
29709 APInt Zeroable(NumMaskElts, 0);
29710 for (unsigned i = 0; i != NumMaskElts; ++i)
29711 if (isUndefOrZero(Mask[i]))
29712 Zeroable.setBit(i);
29714 if (UnaryShuffle) {
// If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
29716 // directly if we don't shuffle the lower element and we shuffle the upper
29717 // (zero) elements within themselves.
29718 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
29719 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
29720 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
29721 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
29722 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
29723 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
29724 return DAG.getBitcast(RootVT, V1);
29728 SDValue NewV1 = V1; // Save operand in case early exit happens.
29729 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29730 NewV1, DL, DAG, Subtarget, Shuffle,
29731 ShuffleSrcVT, ShuffleVT) &&
29732 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29733 if (Depth == 1 && Root.getOpcode() == Shuffle)
29734 return SDValue(); // Nothing to do!
29735 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
29736 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
29737 return DAG.getBitcast(RootVT, Res);
29740 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
29741 AllowIntDomain, Subtarget, Shuffle,
29742 ShuffleVT, PermuteImm) &&
29743 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29744 if (Depth == 1 && Root.getOpcode() == Shuffle)
29745 return SDValue(); // Nothing to do!
29746 Res = DAG.getBitcast(ShuffleVT, V1);
29747 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
29748 DAG.getConstant(PermuteImm, DL, MVT::i8));
29749 return DAG.getBitcast(RootVT, Res);
29753 SDValue NewV1 = V1; // Save operands in case early exit happens.
29754 SDValue NewV2 = V2;
29755 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29756 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
29757 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
29758 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29759 if (Depth == 1 && Root.getOpcode() == Shuffle)
29760 return SDValue(); // Nothing to do!
29761 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
29762 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
29763 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
29764 return DAG.getBitcast(RootVT, Res);
29767 NewV1 = V1; // Save operands in case early exit happens.
29769 if (matchBinaryPermuteVectorShuffle(
29770 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
29771 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
29772 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29773 if (Depth == 1 && Root.getOpcode() == Shuffle)
29774 return SDValue(); // Nothing to do!
29775 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
29776 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
29777 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
29778 DAG.getConstant(PermuteImm, DL, MVT::i8));
29779 return DAG.getBitcast(RootVT, Res);
29782 // Typically from here on, we need an integer version of MaskVT.
29783 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
29784 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
29786 // Annoyingly, SSE4A instructions don't map into the above match helpers.
29787 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
29788 uint64_t BitLen, BitIdx;
29789 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
29791 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
29792 return SDValue(); // Nothing to do!
29793 V1 = DAG.getBitcast(IntMaskVT, V1);
29794 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
29795 DAG.getConstant(BitLen, DL, MVT::i8),
29796 DAG.getConstant(BitIdx, DL, MVT::i8));
29797 return DAG.getBitcast(RootVT, Res);
29800 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
29801 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
29802 return SDValue(); // Nothing to do!
29803 V1 = DAG.getBitcast(IntMaskVT, V1);
29804 V2 = DAG.getBitcast(IntMaskVT, V2);
29805 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
29806 DAG.getConstant(BitLen, DL, MVT::i8),
29807 DAG.getConstant(BitIdx, DL, MVT::i8));
29808 return DAG.getBitcast(RootVT, Res);
29812 // Don't try to re-form single instruction chains under any circumstances now
29813 // that we've done encoding canonicalization for them.
29817 // Depth threshold above which we can efficiently use variable mask shuffles.
29818 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
29819 bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
29821 bool MaskContainsZeros =
29822 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
29824 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
29825 // If we have a single input lane-crossing shuffle then lower to VPERMV.
29826 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29827 ((Subtarget.hasAVX2() &&
29828 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29829 (Subtarget.hasAVX512() &&
29830 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29831 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29832 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29833 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29834 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29835 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29836 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29837 Res = DAG.getBitcast(MaskVT, V1);
29838 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
29839 return DAG.getBitcast(RootVT, Res);
29842 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
29843 // vector as the second source.
29844 if (UnaryShuffle && AllowVariableMask &&
29845 ((Subtarget.hasAVX512() &&
29846 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29847 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29848 (Subtarget.hasVLX() &&
29849 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29850 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29851 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29852 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29853 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29854 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29855 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
29856 for (unsigned i = 0; i != NumMaskElts; ++i)
29857 if (Mask[i] == SM_SentinelZero)
29858 Mask[i] = NumMaskElts + i;
29860 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29861 Res = DAG.getBitcast(MaskVT, V1);
29862 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
29863 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
29864 return DAG.getBitcast(RootVT, Res);
29867 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
29868 if (AllowVariableMask && !MaskContainsZeros &&
29869 ((Subtarget.hasAVX512() &&
29870 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29871 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29872 (Subtarget.hasVLX() &&
29873 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29874 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29875 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29876 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29877 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29878 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29879 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29880 V1 = DAG.getBitcast(MaskVT, V1);
29881 V2 = DAG.getBitcast(MaskVT, V2);
29882 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
29883 return DAG.getBitcast(RootVT, Res);
29888 // See if we can combine a single input shuffle with zeros to a bit-mask,
29889 // which is much simpler than any shuffle.
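// For example, a v4i32 mask {0,Z,2,Z} becomes an AND with the constant
// vector {-1,0,-1,0}.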
29890 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
29891 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
29892 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
29893 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
29894 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
29895 APInt UndefElts(NumMaskElts, 0);
29896 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
29897 for (unsigned i = 0; i != NumMaskElts; ++i) {
29899 if (M == SM_SentinelUndef) {
29900 UndefElts.setBit(i);
29903 if (M == SM_SentinelZero)
29905 EltBits[i] = AllOnes;
29907 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
29908 Res = DAG.getBitcast(MaskVT, V1);
29909 unsigned AndOpcode =
29910 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
29911 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
29912 return DAG.getBitcast(RootVT, Res);
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, use a variable mask to lower to VPERMILPS.
// TODO: Combine other mask types at higher depths.
29918 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29919 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
29920 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
29921 SmallVector<SDValue, 16> VPermIdx;
29922 for (int M : Mask) {
29924 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
29925 VPermIdx.push_back(Idx);
29927 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
29928 Res = DAG.getBitcast(MaskVT, V1);
29929 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
29930 return DAG.getBitcast(RootVT, Res);
29933 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
29934 // to VPERMIL2PD/VPERMIL2PS.
29935 if (AllowVariableMask && Subtarget.hasXOP() &&
29936 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
29937 MaskVT == MVT::v8f32)) {
29938 // VPERMIL2 Operation.
29939 // Bits[3] - Match Bit.
29940 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
29941 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
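// For example, for v4f32 a mask element of 6 (element 2 of V2) encodes as
// index 6, and a zeroable element pushes index 8 (the match/zero bit).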
29942 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
29943 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
29944 SmallVector<int, 8> VPerm2Idx;
29945 unsigned M2ZImm = 0;
29946 for (int M : Mask) {
29947 if (M == SM_SentinelUndef) {
29948 VPerm2Idx.push_back(-1);
29951 if (M == SM_SentinelZero) {
29953 VPerm2Idx.push_back(8);
29956 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
29957 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
29958 VPerm2Idx.push_back(Index);
29960 V1 = DAG.getBitcast(MaskVT, V1);
29961 V2 = DAG.getBitcast(MaskVT, V2);
29962 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
29963 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
29964 DAG.getConstant(M2ZImm, DL, MVT::i8));
29965 return DAG.getBitcast(RootVT, Res);
29968 // If we have 3 or more shuffle instructions or a chain involving a variable
29969 // mask, we can replace them with a single PSHUFB instruction profitably.
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
29971 // instructions, but in practice PSHUFB tends to be *very* fast so we're
29972 // more aggressive.
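// For example, a v4i32 mask {1,0,3,2} expands (Ratio = 4) to the byte mask
// {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; zeroable elements use 255, whose
// set high bit makes PSHUFB write a zero byte.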
29973 if (UnaryShuffle && AllowVariableMask &&
29974 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29975 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
29976 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
29977 SmallVector<SDValue, 16> PSHUFBMask;
29978 int NumBytes = RootVT.getSizeInBits() / 8;
29979 int Ratio = NumBytes / NumMaskElts;
29980 for (int i = 0; i < NumBytes; ++i) {
29981 int M = Mask[i / Ratio];
29982 if (M == SM_SentinelUndef) {
29983 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
29986 if (M == SM_SentinelZero) {
29987 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
29990 M = Ratio * M + i % Ratio;
29991 assert((M / 16) == (i / 16) && "Lane crossing detected");
29992 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29994 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
29995 Res = DAG.getBitcast(ByteVT, V1);
29996 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
29997 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
29998 return DAG.getBitcast(RootVT, Res);
30001 // With XOP, if we have a 128-bit binary input shuffle we can always combine
30002 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
30003 // slower than PSHUFB on targets that support both.
30004 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
30005 // VPPERM Mask Operation
30006 // Bits[4:0] - Byte Index (0 - 31)
30007 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
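// For example, mask byte 20 selects byte 4 of V2, and a zeroable byte is
// encoded as 128 (operation 4 in Bits[7:5]).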
30008 SmallVector<SDValue, 16> VPPERMMask;
30010 int Ratio = NumBytes / NumMaskElts;
30011 for (int i = 0; i < NumBytes; ++i) {
30012 int M = Mask[i / Ratio];
30013 if (M == SM_SentinelUndef) {
30014 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
30017 if (M == SM_SentinelZero) {
30018 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
30021 M = Ratio * M + i % Ratio;
30022 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
30024 MVT ByteVT = MVT::v16i8;
30025 V1 = DAG.getBitcast(ByteVT, V1);
30026 V2 = DAG.getBitcast(ByteVT, V2);
30027 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
30028 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
30029 return DAG.getBitcast(RootVT, Res);
30032 // Failed to find any combines.
30036 // Attempt to constant fold all of the constant source ops.
// Returns the constant vector if the entire shuffle folds to a constant.
30038 // TODO: Extend this to merge multiple constant Ops and update the mask.
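// For example, shuffling constant inputs <1,2,3,4> with mask {3,Z,1,U}
// folds to the constant vector <4,0,2,undef>.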
30039 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
30040 ArrayRef<int> Mask, SDValue Root,
30041 bool HasVariableMask,
30043 const X86Subtarget &Subtarget) {
30044 MVT VT = Root.getSimpleValueType();
30046 unsigned SizeInBits = VT.getSizeInBits();
30047 unsigned NumMaskElts = Mask.size();
30048 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
30049 unsigned NumOps = Ops.size();
30051 // Extract constant bits from each source op.
30052 bool OneUseConstantOp = false;
30053 SmallVector<APInt, 16> UndefEltsOps(NumOps);
30054 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
30055 for (unsigned i = 0; i != NumOps; ++i) {
30056 SDValue SrcOp = Ops[i];
30057 OneUseConstantOp |= SrcOp.hasOneUse();
30058 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
30063 // Only fold if at least one of the constants is only used once or
30064 // the combined shuffle has included a variable mask shuffle, this
30065 // is to avoid constant pool bloat.
30066 if (!OneUseConstantOp && !HasVariableMask)
30069 // Shuffle the constant bits according to the mask.
30070 APInt UndefElts(NumMaskElts, 0);
30071 APInt ZeroElts(NumMaskElts, 0);
30072 APInt ConstantElts(NumMaskElts, 0);
30073 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
30074 APInt::getNullValue(MaskSizeInBits));
30075 for (unsigned i = 0; i != NumMaskElts; ++i) {
30077 if (M == SM_SentinelUndef) {
30078 UndefElts.setBit(i);
30080 } else if (M == SM_SentinelZero) {
30081 ZeroElts.setBit(i);
30084 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
30086 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
30087 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
30089 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
30090 if (SrcUndefElts[SrcMaskIdx]) {
30091 UndefElts.setBit(i);
30095 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
30096 APInt &Bits = SrcEltBits[SrcMaskIdx];
30098 ZeroElts.setBit(i);
30102 ConstantElts.setBit(i);
30103 ConstantBitData[i] = Bits;
30105 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
30107 // Create the constant data.
30109 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
30110 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
30112 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
30114 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
30117 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
30118 return DAG.getBitcast(VT, CstOp);
30121 /// Fully generic combining of x86 shuffle instructions.
30123 /// This should be the last combine run over the x86 shuffle instructions. Once
30124 /// they have been fully optimized, this will recursively consider all chains
30125 /// of single-use shuffle instructions, build a generic model of the cumulative
30126 /// shuffle operation, and check for simpler instructions which implement this
30127 /// operation. We use this primarily for two purposes:
30129 /// 1) Collapse generic shuffles to specialized single instructions when
30130 /// equivalent. In most cases, this is just an encoding size win, but
30131 /// sometimes we will collapse multiple generic shuffles into a single
30132 /// special-purpose shuffle.
30133 /// 2) Look for sequences of shuffle instructions with 3 or more total
30134 /// instructions, and replace them with the slightly more expensive SSSE3
30135 /// PSHUFB instruction if available. We do this as the last combining step
30136 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
30137 /// a suitable short sequence of other instructions. The PSHUFB will either
30138 /// use a register or have to read from memory and so is slightly (but only
30139 /// slightly) more expensive than the other shuffle instructions.
30141 /// Because this is inherently a quadratic operation (for each shuffle in
30142 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
30143 /// This should never be an issue in practice as the shuffle lowering doesn't
30144 /// produce sequences of more than 8 instructions.
30146 /// FIXME: We will currently miss some cases where the redundant shuffling
30147 /// would simplify under the threshold for PSHUFB formation because of
30148 /// combine-ordering. To fix this, we should do the redundant instruction
30149 /// combining in this recursive walk.
30150 static SDValue combineX86ShufflesRecursively(
30151 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
30152 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
30153 bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
30154 // Bound the depth of our recursive combine because this is ultimately
30155 // quadratic in nature.
30156 const unsigned MaxRecursionDepth = 8;
30157 if (Depth > MaxRecursionDepth)
30160 // Directly rip through bitcasts to find the underlying operand.
30161 SDValue Op = SrcOps[SrcOpIndex];
30162 Op = peekThroughOneUseBitcasts(Op);
30164 MVT VT = Op.getSimpleValueType();
30165 if (!VT.isVector())
30166 return SDValue(); // Bail if we hit a non-vector.
30168 assert(Root.getSimpleValueType().isVector() &&
30169 "Shuffles operate on vector types!");
30170 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
30171 "Can only combine shuffles of the same vector register size.");
30173 // Extract target shuffle mask and resolve sentinels and inputs.
30174 SmallVector<int, 64> OpMask;
30175 SmallVector<SDValue, 2> OpInputs;
30176 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
30179 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
30180 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
30181 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
30183 // Add the inputs to the Ops list, avoiding duplicates.
30184 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
30186 int InputIdx0 = -1, InputIdx1 = -1;
30187 for (int i = 0, e = Ops.size(); i < e; ++i) {
30188 SDValue BC = peekThroughBitcasts(Ops[i]);
30189 if (Input0 && BC == peekThroughBitcasts(Input0))
30191 if (Input1 && BC == peekThroughBitcasts(Input1))
30195 if (Input0 && InputIdx0 < 0) {
30196 InputIdx0 = SrcOpIndex;
30197 Ops[SrcOpIndex] = Input0;
30199 if (Input1 && InputIdx1 < 0) {
30200 InputIdx1 = Ops.size();
30201 Ops.push_back(Input1);
30204 assert(((RootMask.size() > OpMask.size() &&
30205 RootMask.size() % OpMask.size() == 0) ||
30206 (OpMask.size() > RootMask.size() &&
30207 OpMask.size() % RootMask.size() == 0) ||
30208 OpMask.size() == RootMask.size()) &&
30209 "The smaller number of elements must divide the larger.");
30211 // This function can be performance-critical, so we rely on the power-of-2
30212 // knowledge that we have about the mask sizes to replace div/rem ops with
30213 // bit-masks and shifts.
30214 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
30215 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
30216 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
30217 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
30219 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
30220 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
30221 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
30222 assert((RootRatio == 1 || OpRatio == 1) &&
30223 "Must not have a ratio for both incoming and op masks!");
30225 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
30226 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
30227 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
30228 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
30229 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
30231 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
30233 // Merge this shuffle operation's mask into our accumulated mask. Note that
30234 // this shuffle's mask will be the first applied to the input, followed by the
30235 // root mask to get us all the way to the root value arrangement. The reason
30236 // for this order is that we are recursing up the operation chain.
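// For example, if the root mask selects element 2 and this op's mask maps
// element 2 to its input element 5, the merged mask selects input element 5.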
30237 for (unsigned i = 0; i < MaskWidth; ++i) {
30238 unsigned RootIdx = i >> RootRatioLog2;
30239 if (RootMask[RootIdx] < 0) {
30240 // This is a zero or undef lane, we're done.
30241 Mask[i] = RootMask[RootIdx];
30245 unsigned RootMaskedIdx =
30247 ? RootMask[RootIdx]
30248 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
30250 // Just insert the scaled root mask value if it references an input other
30251 // than the SrcOp we're currently inserting.
30252 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
30253 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
30254 Mask[i] = RootMaskedIdx;
30258 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
30259 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
30260 if (OpMask[OpIdx] < 0) {
30261 // The incoming lanes are zero or undef, it doesn't matter which ones we
30263 Mask[i] = OpMask[OpIdx];
30267 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
30268 unsigned OpMaskedIdx =
30271 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
30273 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
30274 if (OpMask[OpIdx] < (int)OpMask.size()) {
30275 assert(0 <= InputIdx0 && "Unknown target shuffle input");
30276 OpMaskedIdx += InputIdx0 * MaskWidth;
30278 assert(0 <= InputIdx1 && "Unknown target shuffle input");
30279 OpMaskedIdx += InputIdx1 * MaskWidth;
30282 Mask[i] = OpMaskedIdx;
30285 // Handle the all undef/zero cases early.
30286 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
30287 return DAG.getUNDEF(Root.getValueType());
30289 // TODO - should we handle the mixed zero/undef case as well? Just returning
// a zero mask will lose information on undef elements, possibly reducing
30291 // future combine possibilities.
30292 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
30293 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
30296 // Remove unused shuffle source ops.
30297 resolveTargetShuffleInputsAndMask(Ops, Mask);
30298 assert(!Ops.empty() && "Shuffle with no inputs detected");
30300 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
30302 // Update the list of shuffle nodes that have been combined so far.
30303 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
30305 CombinedNodes.push_back(Op.getNode());
30307 // See if we can recurse into each shuffle source op (if it's a target
30308 // shuffle). The source op should only be combined if it either has a
30309 // single use (i.e. current Op) or all its users have already been combined.
30310 // Don't recurse if we already have more source ops than we can combine in
30311 // the remaining recursion depth.
30312 if (Ops.size() < (MaxRecursionDepth - Depth)) {
30313 for (int i = 0, e = Ops.size(); i < e; ++i)
30314 if (Ops[i].getNode()->hasOneUse() ||
30315 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
30316 if (SDValue Res = combineX86ShufflesRecursively(
30317 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
30322 // Attempt to constant fold all of the constant source ops.
30323 if (SDValue Cst = combineX86ShufflesConstants(
30324 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
30327 // We can only combine unary and binary shuffle mask cases.
30328 if (Ops.size() > 2)
30331 // Minor canonicalization of the accumulated shuffle mask to make it easier
30332 // to match below. All this does is detect masks with sequential pairs of
30333 // elements, and shrink them to the half-width mask. It does this in a loop
30334 // so it will reduce the size of the mask to the minimal width mask which
30335 // performs an equivalent shuffle.
30336 SmallVector<int, 64> WidenedMask;
30337 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
30338 Mask = std::move(WidenedMask);
30341 // Canonicalization of binary shuffle masks to improve pattern matching by
30342 // commuting the inputs.
30343 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
30344 ShuffleVectorSDNode::commuteMask(Mask);
30345 std::swap(Ops[0], Ops[1]);
30348 // Finally, try to combine into a single shuffle instruction.
30349 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
30353 /// Get the PSHUF-style mask from PSHUF node.
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
30356 /// PSHUF-style masks that can be reused with such instructions.
30357 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
30358 MVT VT = N.getSimpleValueType();
30359 SmallVector<int, 4> Mask;
30360 SmallVector<SDValue, 2> Ops;
30363 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
// If we have more than 128 bits, only the low 128 bits of the shuffle mask
// matter. Check that the upper masks are repeats and remove them.
30369 if (VT.getSizeInBits() > 128) {
30370 int LaneElts = 128 / VT.getScalarSizeInBits();
30372 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
30373 for (int j = 0; j < LaneElts; ++j)
30374 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
30375 "Mask doesn't repeat in high 128-bit lanes!");
30377 Mask.resize(LaneElts);
30380 switch (N.getOpcode()) {
30381 case X86ISD::PSHUFD:
30383 case X86ISD::PSHUFLW:
30386 case X86ISD::PSHUFHW:
30387 Mask.erase(Mask.begin(), Mask.begin() + 4);
30388 for (int &M : Mask)
30392 llvm_unreachable("No valid shuffle instruction found!");
30396 /// Search for a combinable shuffle across a chain ending in pshufd.
30398 /// We walk up the chain and look for a combinable shuffle, skipping over
30399 /// shuffles that we could hoist this shuffle's transformation past without
30400 /// altering anything.
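/// For example, PSHUFLW only touches words 0-3 (dwords 0 and 1), so a PSHUFD
/// whose mask keeps dwords 0 and 1 in place and shuffles dwords 2 and 3 among
/// themselves can be hoisted past it.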
30402 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
30403 SelectionDAG &DAG) {
30404 assert(N.getOpcode() == X86ISD::PSHUFD &&
30405 "Called with something other than an x86 128-bit half shuffle!");
30408 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
30409 // of the shuffles in the chain so that we can form a fresh chain to replace
30411 SmallVector<SDValue, 8> Chain;
30412 SDValue V = N.getOperand(0);
30413 for (; V.hasOneUse(); V = V.getOperand(0)) {
30414 switch (V.getOpcode()) {
30416 return SDValue(); // Nothing combined!
30419 // Skip bitcasts as we always know the type for the target specific
30423 case X86ISD::PSHUFD:
30424 // Found another dword shuffle.
    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          LLVM_FALLTHROUGH;
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}

/// Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
/// pair of dwords.
static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
                                        SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  assert(
      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
      "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);
  unsigned CombineOpcode = N.getOpcode();

  // Walk up a single-use chain looking for a combinable shuffle.
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return false; // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      if (V.getOpcode() == CombineOpcode)
        break;

      // Other-half shuffles are no-ops.
      continue;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return false;

  // Combine away the bottom node as its shuffle will be accumulated into
  // a preceding shuffle.
  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Record the old value.
  SDValue Old = V;

  // Merge this node's mask and our incoming mask (adjusted to account for all
  // the pshufd instructions encountered).
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Check that the shuffles didn't cancel each other out. If not, we need to
  // combine to the new one.
  if (Old != V)
    // Replace the combinable shuffle with the combined one, updating all users
    // so that we re-evaluate the chain here.
    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

  return true;
}

/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  unsigned Opcode = N.getOpcode();

  // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
  // single instruction.
  if (VT.getScalarSizeInBits() == 64 &&
      (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
       Opcode == X86ISD::UNPCKL)) {
    auto BC0 = peekThroughBitcasts(N.getOperand(0));
    auto BC1 = peekThroughBitcasts(N.getOperand(1));
    EVT VT0 = BC0.getValueType();
    EVT VT1 = BC1.getValueType();
    unsigned Opcode0 = BC0.getOpcode();
    unsigned Opcode1 = BC1.getOpcode();
    if (Opcode0 == Opcode1 && VT0 == VT1 &&
        (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
         Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
         Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
      SDValue Lo, Hi;
      if (Opcode == X86ISD::MOVSD) {
        Lo = BC1.getOperand(0);
        Hi = BC0.getOperand(1);
      } else {
        Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
        Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
      }
      SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
      return DAG.getBitcast(VT, Horiz);
    }
  }

  switch (Opcode) {
  case X86ISD::VBROADCAST: {
    // If broadcasting from another shuffle, attempt to simplify it.
    // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
    SDValue Src = N.getOperand(0);
    SDValue BC = peekThroughBitcasts(Src);
    EVT SrcVT = Src.getValueType();
    EVT BCVT = BC.getValueType();
    if (isTargetShuffle(BC.getOpcode()) &&
        VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
      unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
      SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
                                        SM_SentinelUndef);
      for (unsigned i = 0; i != Scale; ++i)
        DemandedMask[i] = i;
      if (SDValue Res = combineX86ShufflesRecursively(
              {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
              /*HasVarMask*/ false, DAG, Subtarget))
        return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                           DAG.getBitcast(SrcVT, Res));
    }
    return SDValue();
  }
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  case X86ISD::UNPCKL: {
    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    // moves upper half elements into the lower half part. For example:
    //
    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
    //     undef:v16i8
    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
    //
    // will be combined to:
    //
    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
    //
    // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
    // happen due to advanced instructions.
    if (!VT.is128BitVector())
      return SDValue();

    auto Op0 = N.getOperand(0);
    auto Op1 = N.getOperand(1);
    if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

      unsigned NumElts = VT.getVectorNumElements();
      SmallVector<int, 8> ExpectedMask(NumElts, -1);
      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
                NumElts / 2);

      auto ShufOp = Op1.getOperand(0);
      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
    }
    return SDValue();
  }
  case X86ISD::MOVSD:
  case X86ISD::MOVSS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);

    // Canonicalize scalar FPOps:
    // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
    // If commutable, allow OP(N1[0], N0[0]).
    unsigned Opcode1 = N1.getOpcode();
    if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
        Opcode1 == ISD::FDIV) {
      SDValue N10 = N1.getOperand(0);
      SDValue N11 = N1.getOperand(1);
      if (N10 == N0 ||
          (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
        if (N10 != N0)
          std::swap(N10, N11);
        MVT SVT = VT.getVectorElementType();
        SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
        N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
        N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
        SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
        SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
        return DAG.getNode(Opcode, DL, VT, N0, SclVec);
      }
    }

    return SDValue();
  }
  case X86ISD::INSERTPS: {
    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
    SDValue Op0 = N.getOperand(0);
    SDValue Op1 = N.getOperand(1);
    SDValue Op2 = N.getOperand(2);
    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
    unsigned ZeroMask = InsertPSMask & 0xF;
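
    // The immediate therefore packs three fields: bits [7:6] pick the source
    // element of Op1, bits [5:4] pick the destination lane, and bits [3:0]
    // zero out result lanes. E.g. an immediate of 0x58 (0b01'01'1000) copies
    // Op1[1] into lane 1 and zeroes lane 3.
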
    // If we zero out all elements from Op0 then we don't need to reference it.
    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // If we zero out the element from Op1 then we don't need to reference it.
    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // Attempt to merge insertps Op1 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask1;
    SmallVector<SDValue, 2> Ops1;
    if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
      int M = TargetMask1[SrcIdx];
      if (isUndefOrZero(M)) {
        // Zero/UNDEF insertion - zero out element and remove dependency.
        InsertPSMask |= (1u << DstIdx);
        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
      }
      // Update insertps mask srcidx and reference the source input directly.
      assert(0 <= M && M < 8 && "Shuffle index out of range");
      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
      Op1 = Ops1[M < 4 ? 0 : 1];
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
    }

    // Attempt to merge insertps Op0 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask0;
    SmallVector<SDValue, 2> Ops0;
    if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
      return SDValue();

    bool Updated = false;
    bool UseInput00 = false;
    bool UseInput01 = false;
    for (int i = 0; i != 4; ++i) {
      int M = TargetMask0[i];
      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
        // No change if element is already zero or the inserted element.
        continue;
      } else if (isUndefOrZero(M)) {
        // If the target mask is undef/zero then we must zero the element.
        InsertPSMask |= (1u << i);
        Updated = true;
        continue;
      }

      // The input vector element must be inline.
      if (M != i && M != (i + 4))
        return SDValue();

      // Determine which inputs of the target shuffle we're using.
      UseInput00 |= (0 <= M && M < 4);
      UseInput01 |= (4 <= M);
    }

    // If we're not using both inputs of the target shuffle then use the
    // referenced input directly.
    if (UseInput00 && !UseInput01) {
      Updated = true;
      Op0 = Ops0[0];
    } else if (!UseInput00 && UseInput01) {
      Updated = true;
      Op0 = Ops0[1];
    }

    if (Updated)
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    return SDValue();
  }
  default:
    return SDValue();
  }

  // Nuke no-op shuffles that show up after combining.
  if (isNoopShuffleMask(Mask))
    return N.getOperand(0);

  // Look for simplifications involving one or two shuffle instructions.
  SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.

    // See if this reduces to a PSHUFD which is no more expensive and can
    // combine with more operations. Note that it has to at least flip the
    // dwords as otherwise it would have been removed as a no-op.
    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
      int DMask[] = {0, 1, 2, 3};
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      V = DAG.getBitcast(DVT, V);
      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
      return DAG.getBitcast(VT, V);
    }
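    // For instance, a PSHUFLW with word mask <2,3,0,1> swaps the two dwords
    // held in the low 64 bits, which is exactly the PSHUFD with dword mask
    // <1,0,2,3> formed above after bitcasting to v4i32.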

    // Look for shuffle patterns which can be implemented as a single unpack.
    // FIXME: This doesn't handle the location of the PSHUFD generically, and
    // only works when we have a PSHUFD followed by two half-shuffles.
    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
        (V.getOpcode() == X86ISD::PSHUFLW ||
         V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
          WordMask[i + NOffset] = Mask[i] + NOffset;
          WordMask[i + VOffset] = VMask[i] + VOffset;
        }
        // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
          // We can replace all three shuffles with an unpack.
          V = DAG.getBitcast(VT, D.getOperand(0));
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, VT, V, V);
        }
      }
    }
    break;

  case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
      return NewN;

    break;
  }

  return SDValue();
}

/// Checks if the shuffle mask takes subsequent elements
/// alternately from two vectors.
/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {

  int ParitySrc[2] = {-1, -1};
  unsigned Size = Mask.size();
  for (unsigned i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;

    // Make sure we are using the matching element from the input.
    if ((M % Size) != i)
      return false;

    // Make sure we use the same input for all elements of the same parity.
    int Src = M / Size;
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;
    ParitySrc[i % 2] = Src;
  }

  // Make sure each input is used.
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;

  Op0Even = ParitySrc[0] == 0;
  return true;
}

/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
/// shuffle nodes so they are easier to generically match. We also insert dummy
/// vector shuffle nodes for the operands which explicitly discard the lanes
/// which are unused by this operation, to try to flow the fact that they're
/// unused through the rest of the combiner.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
                             bool &IsSubAdd) {

  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
      !VT.getSimpleVT().isFloatingPoint())
    return false;

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

  // Make sure we have an FADD and an FSUB.
  if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
      (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
      V1.getOpcode() == V2.getOpcode())
    return false;

  // If there are other uses of these operations we can't fold them.
  if (!V1->hasOneUse() || !V2->hasOneUse())
    return false;

  // Ensure that both operations have the same operands. Note that we can
  // commute the FADD operands.
  SDValue LHS, RHS;
  if (V1.getOpcode() == ISD::FSUB) {
    LHS = V1->getOperand(0); RHS = V1->getOperand(1);
    if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
        (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
      return false;
  } else {
    assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
    LHS = V2->getOperand(0); RHS = V2->getOperand(1);
    if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
        (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
      return false;
  }

  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
  bool Op0Even;
  if (!isAddSubOrSubAddMask(Mask, Op0Even))
    return false;

  // It's a subadd if the vector in the even parity is an FADD.
  IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
                     : V2->getOpcode() == ISD::FADD;

  Opnd0 = LHS;
  Opnd1 = RHS;
  return true;
}
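
// As a concrete case, a v4f32 shuffle <0,5,2,7> of fsub(a,b) and fadd(a,b)
// takes the subtraction in the even lanes and the addition in the odd lanes,
// which is precisely X86ISD::ADDSUB a, b (with IsSubAdd == false).
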
/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue combineShuffleToFMAddSub(SDNode *N,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return SDValue();

  MVT VT = N->getSimpleValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
    return SDValue();

  // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue FMAdd = Op0, FMSub = Op1;
  if (FMSub.getOpcode() != X86ISD::FMSUB)
    std::swap(FMAdd, FMSub);

  if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
      FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
      FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
      FMAdd.getOperand(2) != FMSub.getOperand(2))
    return SDValue();

  // Check for correct shuffle mask.
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
  bool Op0Even;
  if (!isAddSubOrSubAddMask(Mask, Op0Even))
    return SDValue();

  // FMAddSub takes zeroth operand from FMSub node.
  SDLoc DL(N);
  bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
  unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
  return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
                     FMAdd.getOperand(2));
}
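
// E.g. shuffle (fma a, b, c), (fmsub a, b, c), <0,5,2,7> keeps the fused add
// in the even lanes and the fused sub in the odd lanes, which matches
// X86ISD::FMSUBADD a, b, c.
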
/// Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
                                                const X86Subtarget &Subtarget,
                                                SelectionDAG &DAG) {
  if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
    return V;

  SDValue Opnd0, Opnd1;
  bool IsSubAdd;
  if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
    return SDValue();

  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
  }

  if (IsSubAdd)
    return SDValue();

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}

// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
    return SDValue();

  EVT VT = N->getValueType(0);

  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();

  if (VT.getVectorElementType() != MVT::i32 &&
      VT.getVectorElementType() != MVT::i64 &&
      VT.getVectorElementType() != MVT::f32 &&
      VT.getVectorElementType() != MVT::f64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Check that both sources are concats with undef.
  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
      !N1.getOperand(1).isUndef())
    return SDValue();

  // Construct the new shuffle mask. Elements from the first source retain their
  // index, but elements from the second source no longer need to skip an undef.
  SmallVector<int, 8> Mask;
  int NumElts = VT.getVectorNumElements();

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  for (int Elt : SVOp->getMask())
    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));

  SDLoc DL(N);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
                               N1.getOperand(0));
  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}

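// E.g. for v8i32 sources concat(t1, undef) and concat(t2, undef), an old mask
// index of 8 (t2's element 0) becomes 8 - 8/2 = 4, its position within the
// new concat(t1, t2) source.
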
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N) {
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
    return SDValue();

  SDValue HOp = N->getOperand(0);
  if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
      HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
    return SDValue();

  // 128-bit horizontal math instructions are defined to operate on adjacent
  // lanes of each operand as:
  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
  // ...similarly for v2f64 and v8i16.
  // TODO: Handle UNDEF operands.
  if (HOp.getOperand(0) != HOp.getOperand(1))
    return SDValue();

  // When the operands of a horizontal math op are identical, the low half of
  // the result is the same as the high half. If the shuffle is also replicating
  // low and high halves, we don't need the shuffle.
  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
  // but this should be tied to whatever horizontal op matching and shuffle
  // canonicalization are producing.
  if (HOp.getValueSizeInBits() == 128 &&
      (isTargetShuffleEquivalent(Mask, {0, 0}) ||
       isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
       isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
    return HOp;

  if (HOp.getValueSizeInBits() == 256 &&
      (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
       isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
       isTargetShuffleEquivalent(
           Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
    return HOp;

  return SDValue();
}

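// E.g. for v4f32, hadd(X, X) yields <X0+X1, X2+X3, X0+X1, X2+X3>, so a
// half-replicating shuffle such as <0,1,0,1> of that result changes nothing.
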
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
  if (TLI.isTypeLegal(VT)) {
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
      return AddSub;

    if (SDValue HAddSub = foldShuffleOfHorizOp(N))
      return HAddSub;
  }

  // During Type Legalization, when promoting illegal vector types,
  // the backend might introduce new shuffle dag nodes and bitcasts.
  //
  // This code performs the following transformation:
  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
  //
  // We do this only if both the bitcast and the BINOP dag nodes have
  // one use. Also, perform this transformation only if the new binary
  // operation is legal. This is to avoid introducing dag nodes that
  // potentially need to be further expanded (or custom lowered) into a
  // less optimal sequence of dag nodes.
  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
      N->getOperand(0).getOpcode() == ISD::BITCAST &&
      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);

    SDValue BC0 = N0.getOperand(0);
    EVT SVT = BC0.getValueType();
    unsigned Opcode = BC0.getOpcode();
    unsigned NumElts = VT.getVectorNumElements();

    if (BC0.hasOneUse() && SVT.isVector() &&
        SVT.getVectorNumElements() * 2 == NumElts &&
        TLI.isOperationLegal(Opcode, VT)) {
      bool CanFold = false;
      switch (Opcode) {
      default : break;
      case ISD::ADD:
      case ISD::SUB:
      case ISD::MUL:
        // isOperationLegal lies for integer ops on floating point types.
        CanFold = VT.isInteger();
        break;
      case ISD::FADD:
      case ISD::FSUB:
      case ISD::FMUL:
        // isOperationLegal lies for floating point ops on integer types.
        CanFold = VT.isFloatingPoint();
        break;
      }

      unsigned SVTNumElts = SVT.getVectorNumElements();
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) < 0;

      if (CanFold) {
        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
      }
    }
  }

  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
      Elts.push_back(Elt);
      continue;
    }
    Elts.clear();
    break;
  }

  if (Elts.size() == VT.getVectorNumElements())
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
      return LD;

  // For AVX2, we sometimes want to combine
  // (vector_shuffle <mask> (concat_vectors t1, undef)
  //                        (concat_vectors t2, undef))
  // Into:
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ.
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
    return ShufConcat;

  if (isTargetShuffle(N->getOpcode())) {
    SDValue Op(N, 0);
    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
      return Shuffle;

    // Try recursively combining arbitrary sequences of x86 shuffle
    // instructions into higher-order shuffles. We do this after combining
    // specific PSHUF instruction sequences into their minimal form so that we
    // can evaluate how many specialized shuffle instructions are involved in
    // a particular chain.
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, Subtarget))
      return Res;
  }

  return SDValue();
}

/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
  EVT EltVT = N->getValueType(0);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT OriginalVT = InVec.getValueType();

  // Peek through bitcasts, don't duplicate a load with other uses.
  InVec = peekThroughOneUseBitcasts(InVec);

  EVT CurrentVT = InVec.getValueType();
  if (!CurrentVT.isVector() ||
      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
    return SDValue();

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                            ShuffleOps, ShuffleMask, UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against an out-of-range extract index.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];

  if (Idx == SM_SentinelZero)
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
  if (Idx == SM_SentinelUndef)
    return DAG.getUNDEF(EltVT);

  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
                                         : ShuffleOps[1];

  // If inputs to shuffle are the same for both ops, then allow 2 uses.
  unsigned AllowedUses =
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

  // If there's a bitcast before the shuffle, check if the load type and
  // alignment are valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job.
  SDLoc dl(N);

  // Create shuffle node, taking into account the case that it's a unary
  // shuffle.
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
                                 ShuffleMask);
  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}

// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
                                  const X86Subtarget &Subtarget) {
  EVT VT = BitCast.getValueType();
  SDValue N0 = BitCast.getOperand(0);
  EVT VecVT = N0->getValueType(0);

  if (!VT.isScalarInteger() || !VecVT.isSimple())
    return SDValue();

  // With AVX512 vxi1 types are legal and we prefer using k-regs.
  // MOVMSK is supported in SSE2 or later.
  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
    return SDValue();

  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
  // v8i16 and v16i16.
  // For these two cases, we can shuffle the upper element bytes to a
  // consecutive sequence at the start of the vector and treat the results as
  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
  // for v16i16 this is not the case, because the shuffle is expensive, so we
  // avoid sign-extending to this type entirely.
  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
  MVT SExtVT;
  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
  switch (VecVT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::v2i1:
    SExtVT = MVT::v2i64;
    FPCastVT = MVT::v2f64;
    break;
  case MVT::v4i1:
    SExtVT = MVT::v4i32;
    FPCastVT = MVT::v4f32;
    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
    // sign-extend to a 256-bit operation to avoid truncation.
    if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
        N0->getOperand(0).getValueType().is256BitVector()) {
      SExtVT = MVT::v4i64;
      FPCastVT = MVT::v4f64;
    }
    break;
  case MVT::v8i1:
    SExtVT = MVT::v8i16;
    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
    // sign-extend to a 256-bit operation to match the compare.
    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
    // 256-bit because the shuffle is cheaper than sign extending the result of
    // the compare.
    if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
        (N0->getOperand(0).getValueType().is256BitVector() ||
         N0->getOperand(0).getValueType().is512BitVector())) {
      SExtVT = MVT::v8i32;
      FPCastVT = MVT::v8f32;
    }
    break;
  case MVT::v16i1:
    SExtVT = MVT::v16i8;
    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
    // it is not profitable to sign-extend to 256-bit because this will
    // require an extra cross-lane shuffle which is more expensive than
    // truncating the result of the compare to 128-bits.
    break;
  case MVT::v32i1:
    SExtVT = MVT::v32i8;
    break;
  };

  SDLoc DL(BitCast);
  SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);

  if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
    V = getPMOVMSKB(DL, V, DAG, Subtarget);
    return DAG.getZExtOrTrunc(V, DL, VT);
  }

  if (SExtVT == MVT::v8i16) {
    assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
    V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
                    DAG.getUNDEF(MVT::v8i16));
  } else
    assert(SExtVT.getScalarType() != MVT::i16 &&
           "Vectors of i16 must be packed");
  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
    V = DAG.getBitcast(FPCastVT, V);
  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
  return DAG.getZExtOrTrunc(V, DL, VT);
}
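
// E.g. (i16 bitcast (v16i1 setcc v16i8 a, b)) becomes a v16i8 sign-extension
// to all-ones/all-zeros bytes followed by PMOVMSKB, which gathers the 16 sign
// bits into a GPR that is then truncated to i16.
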
// Convert a vXi1 constant build vector to the same width scalar integer.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
  EVT SrcVT = Op.getValueType();
  assert(SrcVT.getVectorElementType() == MVT::i1 &&
         "Expected a vXi1 vector");
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         "Expected a constant build vector");

  APInt Imm(SrcVT.getVectorNumElements(), 0);
  for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
    SDValue In = Op.getOperand(Idx);
    if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
      Imm.setBit(Idx);
  }
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
  return DAG.getConstant(Imm, SDLoc(Op), IntVT);
}
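
// E.g. the v4i1 constant <1,0,1,1> becomes the i4 immediate 0b1101, with
// vector element N mapped to bit N of the scalar.
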
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");

  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  // Only do this if we have k-registers.
  if (!Subtarget.hasAVX512())
    return SDValue();

  EVT DstVT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  EVT SrcVT = Op.getValueType();

  if (!Op.hasOneUse())
    return SDValue();

  // Look for logic ops.
  if (Op.getOpcode() != ISD::AND &&
      Op.getOpcode() != ISD::OR &&
      Op.getOpcode() != ISD::XOR)
    return SDValue();

  // Make sure we have a bitcast between mask registers and a scalar type.
  if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
        DstVT.isScalarInteger()) &&
      !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
        SrcVT.isScalarInteger()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
      LHS.getOperand(0).getValueType() == DstVT)
    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
                       DAG.getBitcast(DstVT, RHS));

  if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
      RHS.getOperand(0).getValueType() == DstVT)
    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
                       DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));

  // If the RHS is a vXi1 build vector, this is a good reason to flip too.
  // Most of these have to move a constant from the scalar domain anyway.
  if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
    RHS = combinevXi1ConstantToInteger(RHS, DAG);
    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
                       DAG.getBitcast(DstVT, LHS), RHS);
  }

  return SDValue();
}

static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  unsigned NumElts = N.getNumOperands();

  auto *BV = cast<BuildVectorSDNode>(N);
  SDValue Splat = BV->getSplatValue();

  // Build MMX element from integer GPR or SSE float values.
  auto CreateMMXElement = [&](SDValue V) {
    if (V.isUndef())
      return DAG.getUNDEF(MVT::x86mmx);
    if (V.getValueType().isFloatingPoint()) {
      if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
        V = DAG.getBitcast(MVT::v2i64, V);
        return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
      }
      V = DAG.getBitcast(MVT::i32, V);
    } else {
      V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
    }
    return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
  };

  // Convert build vector ops to MMX data in the bottom elements.
  SmallVector<SDValue, 8> Ops;

  // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
  if (Splat) {
    if (Splat.isUndef())
      return DAG.getUNDEF(MVT::x86mmx);

    Splat = CreateMMXElement(Splat);

    if (Subtarget.hasSSE1()) {
      // Unpack v8i8 to splat i8 elements to lowest 16-bits.
      if (NumElts == 8)
        Splat = DAG.getNode(
            ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
            DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
            Splat);

      // Use PSHUFW to repeat 16-bit elements.
      unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
          DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
          DAG.getConstant(ShufMask, DL, MVT::i8));
    }
    Ops.append(NumElts, Splat);
  } else {
    for (unsigned i = 0; i != NumElts; ++i)
      Ops.push_back(CreateMMXElement(N.getOperand(i)));
  }

  // Use tree of PUNPCKLs to build up general MMX vector.
  while (Ops.size() > 1) {
    unsigned NumOps = Ops.size();
    unsigned IntrinOp =
        (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
                     : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
                                    : Intrinsic::x86_mmx_punpcklbw));
    SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
    for (unsigned i = 0; i != NumOps; i += 2)
      Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
                               Ops[i], Ops[i + 1]);
    Ops.resize(NumOps / 2);
  }

  return Ops[0];
}
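
// For a non-splat v8i8 build vector this takes three rounds: eight one-byte
// elements pair up via punpcklbw into four, then punpcklwd into two, and a
// final punpckldq leaves the full 64-bit MMX value in Ops[0].
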
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();

  // Try to match patterns such as
  // (i16 bitcast (v16i1 x))
  // ->
  // (i16 movmsk (v16i8 sext (v16i1 x)))
  // before the setcc result is scalarized on subtargets that don't have legal
  // vxi1 types.
  if (DCI.isBeforeLegalize()) {
    if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
      return V;

    // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
    // type, widen both sides to avoid a trip through memory.
    if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
        Subtarget.hasAVX512()) {
      SDLoc dl(N);
      N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
      N0 = DAG.getBitcast(MVT::v8i1, N0);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
                         DAG.getIntPtrConstant(0, dl));
    }

    // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
    // type, widen both sides to avoid a trip through memory.
    if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
        Subtarget.hasAVX512()) {
      SDLoc dl(N);
      unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
      SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
      Ops[0] = N0;
      N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
      N0 = DAG.getBitcast(MVT::i8, N0);
      return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
    }
  }

  // Since MMX types are special and don't usually play with other vector types,
  // it's better to handle them early to be sure we emit efficient code by
  // avoiding store-load conversions.
  if (VT == MVT::x86mmx) {
    // Detect MMX constant vectors.
    APInt UndefElts;
    SmallVector<APInt, 1> EltBits;
    if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
      SDLoc DL(N0);
      // Handle zero-extension of i32 with MOVD.
      if (EltBits[0].countLeadingZeros() >= 32)
        return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
                           DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
      // Else, bitcast to a double.
      // TODO - investigate supporting sext 32-bit immediates on x86_64.
      APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
      return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
    }

    // Detect bitcasts to x86mmx low word.
    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
        (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
        N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
      bool LowUndef = true, AllUndefOrZero = true;
      for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
        SDValue Op = N0.getOperand(i);
        LowUndef &= Op.isUndef() || (i >= e/2);
        AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
      }
      if (AllUndefOrZero) {
        SDValue N00 = N0.getOperand(0);
        SDLoc dl(N00);
        N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
                       : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
        return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
      }
    }

    // Detect bitcasts of 64-bit build vectors and convert to a
    // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
    // lowest element.
    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
        (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
         SrcVT == MVT::v8i8))
      return createMMXBuildVector(N0, DAG, Subtarget);

    // Detect bitcasts between element or subvector extraction to x86mmx.
    if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
         N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
        isNullConstant(N0.getOperand(1))) {
      SDValue N00 = N0.getOperand(0);
      if (N00.getValueType().is128BitVector())
        return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
                           DAG.getBitcast(MVT::v2i64, N00));
    }

    // Detect bitcasts from FP_TO_SINT to x86mmx.
    if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
      SDLoc DL(N0);
      SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                                DAG.getUNDEF(MVT::v2i32));
      return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
                         DAG.getBitcast(MVT::v2i64, Res));
    }
  }

  // Try to remove a bitcast of constant vXi1 vector. We have to legalize
  // most of these to scalar anyway.
  if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
      SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
      ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
    return combinevXi1ConstantToInteger(N0, DAG);
  }

  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
      VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      isa<ConstantSDNode>(N0)) {
    auto *C = cast<ConstantSDNode>(N0);
    if (C->isAllOnesValue())
      return DAG.getConstant(1, SDLoc(N0), VT);
    if (C->isNullValue())
      return DAG.getConstant(0, SDLoc(N0), VT);
  }

  // Try to remove bitcasts from input and output of mask arithmetic to
  // remove GPR<->K-register crossings.
  if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
    return V;

  // Convert a bitcasted integer logic operation that has one bitcasted
  // floating-point operand into a floating-point logic operation. This may
  // create a load of a constant, but that is cheaper than materializing the
  // constant in an integer register and transferring it to an SSE register or
  // transferring the SSE operand to integer register and back.
  unsigned FPOpcode;
  switch (N0.getOpcode()) {
  case ISD::AND: FPOpcode = X86ISD::FAND; break;
  case ISD::OR: FPOpcode = X86ISD::FOR; break;
  case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
  default: return SDValue();
  }

  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
        (Subtarget.hasSSE2() && VT == MVT::f64)))
    return SDValue();

  SDValue LogicOp0 = N0.getOperand(0);
  SDValue LogicOp1 = N0.getOperand(1);
  SDLoc DL0(N0);

  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
  }
  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
  }

  return SDValue();
}

// Match a binop + shuffle pyramid that represents a horizontal reduction over
// the elements of a vector.
// Returns the vector that is being reduced on, or SDValue() if a reduction
// was not matched.
static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
                                   ArrayRef<ISD::NodeType> CandidateBinOps) {
  // The pattern must end in an extract from index 0.
  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
      !isNullConstant(Extract->getOperand(1)))
    return SDValue();

  SDValue Op = Extract->getOperand(0);
  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());

  // Match against one of the candidate binary ops.
  if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
        return Op.getOpcode() == unsigned(BinOp);
      }))
    return SDValue();

  // At each stage, we're looking for something that looks like:
  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
  //                               i32 undef, i32 undef, i32 undef, i32 undef>
  // %a = binop <8 x i32> %op, %s
  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
  // we expect something like:
  // <4,5,6,7,u,u,u,u>
  // <2,3,u,u,u,u,u,u>
  // <1,u,u,u,u,u,u,u>
  unsigned CandidateBinOp = Op.getOpcode();
  for (unsigned i = 0; i < Stages; ++i) {
    if (Op.getOpcode() != CandidateBinOp)
      return SDValue();

    ShuffleVectorSDNode *Shuffle =
        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
    if (Shuffle) {
      Op = Op.getOperand(1);
    } else {
      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
      Op = Op.getOperand(0);
    }

    // The first operand of the shuffle should be the same as the other operand
    // of the binop.
    if (!Shuffle || Shuffle->getOperand(0) != Op)
      return SDValue();

    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
        return SDValue();
  }

  BinOp = CandidateBinOp;
  return Op;
}

// Given a select, detect the following pattern:
// 1: %2 = zext <N x i8> %0 to <N x i32>
// 2: %3 = zext <N x i8> %1 to <N x i32>
// 3: %4 = sub nsw <N x i32> %2, %3
// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
                              SDValue &Op1) {
  // Check that the condition of the select instruction is greater-than.
  SDValue SetCC = Select->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC)
    return false;
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  if (CC != ISD::SETGT && CC != ISD::SETLT)
    return false;

  SDValue SelectOp1 = Select->getOperand(1);
  SDValue SelectOp2 = Select->getOperand(2);

  // The following instructions assume SelectOp1 is the subtraction operand
  // and SelectOp2 is the negation operand.
  // In the case of SETLT this is the other way around.
  if (CC == ISD::SETLT)
    std::swap(SelectOp1, SelectOp2);

  // The second operand of the select should be the negation of the first
  // operand, which is implemented as 0 - SelectOp1.
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
        SelectOp2.getOperand(1) == SelectOp1))
    return false;

  // The first operand of SetCC is the first operand of the select, which is the
  // difference between the two input vectors.
  if (SetCC.getOperand(0) != SelectOp1)
    return false;

  // In the SETLT case, the second operand of the comparison can be either 1
  // or 0.
  APInt SplatVal;
  if ((CC == ISD::SETLT) &&
      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
         SplatVal.isOneValue()) ||
        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
    return false;

  // In the SETGT case, the second operand of the comparison can be either -1
  // or 0.
  if ((CC == ISD::SETGT) &&
      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
    return false;

  // The first operand of the select is the difference between the two input
  // vectors.
  if (SelectOp1.getOpcode() != ISD::SUB)
    return false;

  Op0 = SelectOp1.getOperand(0);
  Op1 = SelectOp1.getOperand(1);

  // Check if the operands of the sub are zero-extended from vectors of i8.
  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
      Op1.getOpcode() != ISD::ZERO_EXTEND ||
      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
    return false;

  return true;
}

// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL,
                            const X86Subtarget &Subtarget) {
  // Find the appropriate width for the PSADBW.
  EVT InVT = Zext0.getOperand(0).getValueType();
  unsigned RegSize = std::max(128u, InVT.getSizeInBits());

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0.
  unsigned NumConcat = RegSize / InVT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
  Ops[0] = Zext0.getOperand(0);
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = Zext1.getOperand(0);
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
  auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Ops) {
    MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
    return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
  };
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
                          PSADBWBuilder);
}
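
// PSADBW itself computes, for each 8-byte group, the sum of absolute
// differences of the two operands' bytes, producing one zero-extended 16-bit
// sum per 64-bit result element.
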
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
  // Bail without SSE41.
  if (!Subtarget.hasSSE41())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
    return SDValue();

  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
  unsigned BinOp;
  SDValue Src = matchBinOpReduction(
      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
  if (!Src)
    return SDValue();

  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getScalarType();
  if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
    return SDValue();

  SDLoc DL(Extract);
  SDValue MinPos = Src;

  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
  while (SrcVT.getSizeInBits() > 128) {
    unsigned NumElts = SrcVT.getVectorNumElements();
    unsigned NumSubElts = NumElts / 2;
    SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
    unsigned SubSizeInBits = SrcVT.getSizeInBits();
    SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
    SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
  }
  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
         "Unexpected value type");

  // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
  // to flip the value accordingly.
  SDValue Mask;
  unsigned MaskEltsBits = ExtractVT.getSizeInBits();
  if (BinOp == ISD::SMAX)
    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
  else if (BinOp == ISD::SMIN)
    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
  else if (BinOp == ISD::UMAX)
    Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);

  if (Mask)
    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
  // shuffling each upper element down and insert zeros. This means that the
  // v16i8 UMIN will leave the upper element as zero, performing zero-extension
  // ready for the PHMINPOS.
  if (ExtractVT == MVT::i8) {
    SDValue Upper = DAG.getVectorShuffle(
        SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
  }

  // Perform the PHMINPOS on a v8i16 vector.
  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
  MinPos = DAG.getBitcast(SrcVT, MinPos);

  if (Mask)
    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
                     DAG.getIntPtrConstant(0, DL));
}
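
// The XOR trick works because, e.g. for SMAX, flipping each value with 0x7FFF
// (signed-max) maps signed ordering onto reversed unsigned ordering, so
// PHMINPOSUW's unsigned-min search finds the signed-max element; the second
// XOR afterwards restores the original value.
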
32028 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
32029 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
32031 const X86Subtarget &Subtarget) {
32032 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
32033 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
32036 EVT ExtractVT = Extract->getValueType(0);
32037 unsigned BitWidth = ExtractVT.getSizeInBits();
32038 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
32039 ExtractVT != MVT::i8)
32042 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
32043 unsigned BinOp = 0;
32044 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
32048 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
32049 // which we can't support here for now.
32050 if (Match.getScalarValueSizeInBits() != BitWidth)
32053 // We require AVX2 for PMOVMSKB on v16i16/v32i8.
32054 unsigned MatchSizeInBits = Match.getValueSizeInBits();
32055 if (!(MatchSizeInBits == 128 ||
32056 (MatchSizeInBits == 256 &&
32057 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
32060 // Don't bother performing this for 2-element vectors.
32061 if (Match.getValueType().getVectorNumElements() <= 2)
32064 // Check that we are extracting a reduction of all sign bits.
32065 if (DAG.ComputeNumSignBits(Match) != BitWidth)
32068 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
32070 if (64 == BitWidth || 32 == BitWidth)
32071 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
32072 MatchSizeInBits / BitWidth);
32074 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
32077 ISD::CondCode CondCode;
32078 if (BinOp == ISD::OR) {
32079 // any_of -> MOVMSK != 0
32080 CompareBits = APInt::getNullValue(32);
32081 CondCode = ISD::CondCode::SETNE;
32083 // all_of -> MOVMSK == ((1 << NumElts) - 1)
32084 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
32085 CondCode = ISD::CondCode::SETEQ;
32088 // Perform the select as i32/i64 and then truncate to avoid partial register stalls.
32090 unsigned ResWidth = std::max(BitWidth, 32u);
32091 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
32093 SDValue Zero = DAG.getConstant(0, DL, ResVT);
32094 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
32095 SDValue Res = DAG.getBitcast(MaskVT, Match);
32096 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
32097 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
32098 Ones, Zero, CondCode);
32099 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
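// Illustrative example (editor's addition): an all_of reduction over a
// v16i8 compare result, where every lane is known to be 0 or -1:
//   pcmpeqb  %xmm1, %xmm0
//   pmovmskb %xmm0, %eax
//   cmpl     $0xFFFF, %eax     ; all_of: mask == (1 << 16) - 1
// An any_of reduction instead tests the PMOVMSKB result against zero.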
32102 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
32103 const X86Subtarget &Subtarget) {
32104 // PSADBW is only supported on SSE2 and up.
32105 if (!Subtarget.hasSSE2())
32108 // Verify the type we're extracting from is an integer type wider than i16.
32109 EVT VT = Extract->getOperand(0).getValueType();
32110 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
32113 unsigned RegSize = 128;
32114 if (Subtarget.useBWIRegs())
32116 else if (Subtarget.hasAVX())
32119 // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
32120 // TODO: We should be able to handle larger vectors by splitting them before
32121 // feeding them into several SADs, and then reducing over those.
32122 if (RegSize / VT.getVectorNumElements() < 8)
32125 // Match shuffle + add pyramid.
32126 unsigned BinOp = 0;
32127 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
32129 // The operand is expected to be zero extended from i8
32130 // (verified in detectZextAbsDiff).
32131 // In order to convert to i64 and above, an additional any/zero/sign
32132 // extend is expected.
32133 // The zero extend from 32 bits has no mathematical effect on the result.
32134 // Also, the sign extend is effectively a zero extend
32135 // (it extends the sign bit, which is zero),
32136 // so it is correct to skip the sign/zero extend instruction.
32137 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
32138 Root.getOpcode() == ISD::ZERO_EXTEND ||
32139 Root.getOpcode() == ISD::ANY_EXTEND))
32140 Root = Root.getOperand(0);
32142 // If there was a match, we want Root to be a select that is the root of an
32143 // abs-diff pattern.
32144 if (!Root || (Root.getOpcode() != ISD::VSELECT))
32147 // Check whether we have an abs-diff pattern feeding into the select.
32148 SDValue Zext0, Zext1;
32149 if (!detectZextAbsDiff(Root, Zext0, Zext1))
32152 // Create the SAD instruction.
32154 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
32156 // If the original vector was wider than 8 elements, sum over the results
32157 // in the SAD vector.
32158 unsigned Stages = Log2_32(VT.getVectorNumElements());
32159 MVT SadVT = SAD.getSimpleValueType();
32161 unsigned SadElems = SadVT.getVectorNumElements();
32163 for (unsigned i = Stages - 3; i > 0; --i) {
32164 SmallVector<int, 16> Mask(SadElems, -1);
32165 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
32166 Mask[j] = MaskEnd + j;
32169 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
32170 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
32174 MVT Type = Extract->getSimpleValueType(0);
32175 unsigned TypeSizeInBits = Type.getSizeInBits();
32176 // Return the lowest TypeSizeInBits bits.
32177 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
32178 SAD = DAG.getBitcast(ResVT, SAD);
32179 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
32180 Extract->getOperand(1));
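// Illustrative example (editor's addition): the pattern matched above
// corresponds to IR along the lines of
//   %d = sub <16 x i32> (zext %a), (zext %b)
//   %s = vselect (setgt %d, -1), %d, (sub 0, %d)   ; absolute difference
//   ... a shuffle/add pyramid reducing %s to one element ...
// and is replaced by a single psadbw feeding a short shuffle+add tree over
// the 64-bit partial sums.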
32183 // Attempt to peek through a target shuffle and extract the scalar from the source.
32185 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
32186 TargetLowering::DAGCombinerInfo &DCI,
32187 const X86Subtarget &Subtarget) {
32188 if (DCI.isBeforeLegalizeOps())
32191 SDValue Src = N->getOperand(0);
32192 SDValue Idx = N->getOperand(1);
32194 EVT VT = N->getValueType(0);
32195 EVT SrcVT = Src.getValueType();
32196 EVT SrcSVT = SrcVT.getVectorElementType();
32197 unsigned NumSrcElts = SrcVT.getVectorNumElements();
32199 // Don't attempt this for boolean mask vectors or unknown extraction indices.
32200 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
32203 // Handle extract(broadcast(scalar_value)); it doesn't matter what the index is.
32204 if (X86ISD::VBROADCAST == Src.getOpcode() &&
32205 Src.getOperand(0).getValueType() == VT)
32206 return Src.getOperand(0);
32208 // Resolve the target shuffle inputs and mask.
32209 SmallVector<int, 16> Mask;
32210 SmallVector<SDValue, 2> Ops;
32211 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
32214 // Attempt to narrow/widen the shuffle mask to the correct size.
32215 if (Mask.size() != NumSrcElts) {
32216 if ((NumSrcElts % Mask.size()) == 0) {
32217 SmallVector<int, 16> ScaledMask;
32218 int Scale = NumSrcElts / Mask.size();
32219 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
32220 Mask = std::move(ScaledMask);
32221 } else if ((Mask.size() % NumSrcElts) == 0) {
32222 SmallVector<int, 16> WidenedMask;
32223 while (Mask.size() > NumSrcElts &&
32224 canWidenShuffleElements(Mask, WidenedMask))
32225 Mask = std::move(WidenedMask);
32226 // TODO - investigate support for wider shuffle masks with known upper
32227 // undef/zero elements for implicit zero-extension.
32231 // Check if narrowing/widening failed.
32232 if (Mask.size() != NumSrcElts)
32235 int SrcIdx = Mask[N->getConstantOperandVal(1)];
32238 // If the shuffle source element is undef/zero then we can just accept it.
32239 if (SrcIdx == SM_SentinelUndef)
32240 return DAG.getUNDEF(VT);
32242 if (SrcIdx == SM_SentinelZero)
32243 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
32244 : DAG.getConstant(0, dl, VT);
32246 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
32247 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
32248 SrcIdx = SrcIdx % Mask.size();
32250 // We can only extract other elements from 128-bit vectors and in certain
32251 // circumstances, depending on SSE-level.
32252 // TODO: Investigate using extract_subvector for larger vectors.
32253 // TODO: Investigate float/double extraction if it will be just stored.
32254 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
32255 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
32256 assert(SrcSVT == VT && "Unexpected extraction type");
32257 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
32258 DAG.getIntPtrConstant(SrcIdx, dl));
32261 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
32262 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
32263 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
32264 "Unexpected extraction type");
32265 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
32266 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
32267 DAG.getIntPtrConstant(SrcIdx, dl));
32268 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
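// Illustrative example (editor's addition): if the resolved shuffle mask
// maps the extracted lane, say lane 5 of a v8i16, to lane 2 of one of the
// shuffle sources, we can emit 'pextrw $2' on that source directly instead
// of materializing the shuffle and then extracting lane 5.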
32274 /// Detect vector gather/scatter index generation and convert it from being a
32275 /// bunch of shuffles and extracts into a somewhat faster sequence.
32276 /// For i686, the best sequence is apparently storing the value and loading
32277 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
32278 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
32279 TargetLowering::DAGCombinerInfo &DCI,
32280 const X86Subtarget &Subtarget) {
32281 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
32284 // TODO - Remove this once we can handle the implicit zero-extension of
32285 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
32286 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
32287 // combineBasicSADPattern.
32288 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
32291 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
32294 SDValue InputVector = N->getOperand(0);
32295 SDValue EltIdx = N->getOperand(1);
32297 EVT SrcVT = InputVector.getValueType();
32298 EVT VT = N->getValueType(0);
32299 SDLoc dl(InputVector);
32301 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
32302 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32303 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
32304 SDValue MMXSrc = InputVector.getOperand(0);
32306 // The bitcast source is a direct mmx result.
32307 if (MMXSrc.getValueType() == MVT::x86mmx)
32308 return DAG.getBitcast(VT, InputVector);
32311 // Detect mmx to i32 conversion through a v2i32 elt extract.
32312 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32313 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
32314 SDValue MMXSrc = InputVector.getOperand(0);
32316 // The bitcast source is a direct mmx result.
32317 if (MMXSrc.getValueType() == MVT::x86mmx)
32318 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
32321 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
32322 isa<ConstantSDNode>(EltIdx) &&
32323 isa<ConstantSDNode>(InputVector.getOperand(0))) {
32324 uint64_t ExtractedElt = N->getConstantOperandVal(1);
32325 auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
32326 const APInt &InputValue = InputC->getAPIntValue();
32327 uint64_t Res = InputValue[ExtractedElt];
32328 return DAG.getConstant(Res, dl, MVT::i1);
32331 // Check whether this extract is the root of a sum of absolute differences
32332 // pattern. This has to be done here because we really want it to happen
32333 // pre-legalization.
32334 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
32337 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
32338 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
32341 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
32342 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
32348 /// If a vector select has an operand that is -1 or 0, try to simplify the
32349 /// select to a bitwise logic operation.
32350 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
32352 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
32353 TargetLowering::DAGCombinerInfo &DCI,
32354 const X86Subtarget &Subtarget) {
32355 SDValue Cond = N->getOperand(0);
32356 SDValue LHS = N->getOperand(1);
32357 SDValue RHS = N->getOperand(2);
32358 EVT VT = LHS.getValueType();
32359 EVT CondVT = Cond.getValueType();
32361 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32363 if (N->getOpcode() != ISD::VSELECT)
32366 assert(CondVT.isVector() && "Vector select expects a vector selector!");
32368 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
32369 // Check if the first operand is all zeros and the condition type is vXi1.
32370 // This situation only applies to AVX512.
32371 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
32372 CondVT.getVectorElementType() == MVT::i1) {
32373 // Invert the cond to not(cond) : xor(op,allones)=not(op)
32374 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
32375 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
32376 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
32379 // To use the condition operand as a bitwise mask, it must have elements that
32380 // are the same size as the select elements, i.e., the condition operand must
32381 // have already been promoted from the IR select condition type <N x i1>.
32382 // Don't check if the types themselves are equal because that excludes
32383 // vector floating-point selects.
32384 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
32387 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
32388 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
32390 // Try to invert the condition if the true value is not all 1s and the false value is not all 0s.
32392 if (!TValIsAllOnes && !FValIsAllZeros &&
32393 // Check if the selector will be produced by CMPP*/PCMP*.
32394 Cond.getOpcode() == ISD::SETCC &&
32395 // Check if SETCC has already been promoted.
32396 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
32398 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
32400 if (TValIsAllZeros || FValIsAllOnes) {
32401 SDValue CC = Cond.getOperand(2);
32402 ISD::CondCode NewCC =
32403 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
32404 Cond.getOperand(0).getValueType().isInteger());
32405 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
32407 std::swap(LHS, RHS);
32408 TValIsAllOnes = FValIsAllOnes;
32409 FValIsAllZeros = TValIsAllZeros;
32413 // Cond value must be 'sign splat' to be converted to a logical op.
32414 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
32417 // vselect Cond, 111..., 000... -> Cond
32418 if (TValIsAllOnes && FValIsAllZeros)
32419 return DAG.getBitcast(VT, Cond);
32421 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
32424 // vselect Cond, 111..., X -> or Cond, X
32425 if (TValIsAllOnes) {
32426 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
32427 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
32428 return DAG.getBitcast(VT, Or);
32431 // vselect Cond, X, 000... -> and Cond, X
32432 if (FValIsAllZeros) {
32433 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
32434 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
32435 return DAG.getBitcast(VT, And);
32438 // vselect Cond, 000..., X -> andn Cond, X
32439 if (TValIsAllZeros) {
32440 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
32441 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
32442 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
32443 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
32444 return DAG.getBitcast(VT, AndN);
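// Illustrative summary (editor's addition) of the folds above, for a
// condition whose lanes are known all-ones or all-zeros:
//   vselect C, -1, X  -> por   C, X
//   vselect C,  X, 0  -> pand  C, X
//   vselect C,  0, X  -> pandn C, X   (ANDNP computes ~C & X)
//   vselect C, -1, 0  -> bitcast of C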
32450 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
32451 SDValue Cond = N->getOperand(0);
32452 SDValue LHS = N->getOperand(1);
32453 SDValue RHS = N->getOperand(2);
32456 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
32457 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
32458 if (!TrueC || !FalseC)
32461 // Don't do this for crazy integer types.
32462 EVT VT = N->getValueType(0);
32463 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32466 // We're going to use the condition bit in math or logic ops. We could allow
32467 // this with a wider condition value (post-legalization it becomes an i8),
32468 // but if nothing is creating selects that late, it doesn't matter.
32469 if (Cond.getValueType() != MVT::i1)
32472 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
32473 // 3, 5, or 9 with i32/i64, so those get transformed too.
32474 // TODO: For constants that overflow or do not differ by power-of-2 or small
32475 // multiplier, convert to 'and' + 'add'.
32476 const APInt &TrueVal = TrueC->getAPIntValue();
32477 const APInt &FalseVal = FalseC->getAPIntValue();
32479 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
32483 APInt AbsDiff = Diff.abs();
32484 if (AbsDiff.isPowerOf2() ||
32485 ((VT == MVT::i32 || VT == MVT::i64) &&
32486 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
32488 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
32489 // of the condition can usually be folded into a compare predicate, but even
32490 // without that, the sequence should be cheaper than a CMOV alternative.
32491 if (TrueVal.slt(FalseVal)) {
32492 Cond = DAG.getNOT(DL, Cond, MVT::i1);
32493 std::swap(TrueC, FalseC);
32496 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
32497 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
32499 // Multiply condition by the difference if non-one.
32500 if (!AbsDiff.isOneValue())
32501 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
32503 // Add the base if non-zero.
32504 if (!FalseC->isNullValue())
32505 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
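// Worked example (editor's addition): select i1 %c, i32 9, i32 1 has
// AbsDiff == 8, a power of 2, so it becomes (conceptually):
//   %z = zext i1 %c to i32
//   %m = shl i32 %z, 3        ; zext(Cond) * (TC - FC)
//   %r = add i32 %m, 1        ; + FC
// avoiding a CMOV and the materialization of both constants.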
32513 /// If this is a *dynamic* select (non-constant condition) and we can match
32514 /// this node with one of the variable blend instructions, restructure the
32515 /// condition so that blends can use the high (sign) bit of each element.
32516 static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
32517 TargetLowering::DAGCombinerInfo &DCI,
32518 const X86Subtarget &Subtarget) {
32519 SDValue Cond = N->getOperand(0);
32520 if (N->getOpcode() != ISD::VSELECT ||
32521 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
32524 // Don't optimize before the condition has been transformed to a legal type
32525 // and don't ever optimize vector selects that map to AVX512 mask-registers.
32526 unsigned BitWidth = Cond.getScalarValueSizeInBits();
32527 if (BitWidth < 8 || BitWidth > 64)
32530 // We can only handle the cases where VSELECT is directly legal on the
32531 // subtarget. We custom lower VSELECT nodes with constant conditions and
32532 // this makes it hard to see whether a dynamic VSELECT will correctly
32533 // lower, so we both check the operation's status and explicitly handle the
32534 // cases where a *dynamic* blend will fail even though a constant-condition
32535 // blend could be custom lowered.
32536 // FIXME: We should find a better way to handle this class of problems.
32537 // Potentially, we should combine constant-condition vselect nodes
32538 // pre-legalization into shuffles and not mark as many types as custom lowered.
32540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32541 EVT VT = N->getValueType(0);
32542 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
32544 // FIXME: We don't support i16-element blends currently. We could and
32545 // should support them by making *all* the bits in the condition be set
32546 // rather than just the high bit and using an i8-element blend.
32547 if (VT.getVectorElementType() == MVT::i16)
32549 // Dynamic blending was only available from SSE4.1 onward.
32550 if (VT.is128BitVector() && !Subtarget.hasSSE41())
32552 // Byte blends are only available in AVX2.
32553 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
32555 // There are no 512-bit blend instructions that use sign bits.
32556 if (VT.is512BitVector())
32559 // TODO: Add other opcodes eventually lowered into BLEND.
32560 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
32562 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
32565 APInt DemandedMask(APInt::getSignMask(BitWidth));
32567 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32568 !DCI.isBeforeLegalizeOps());
32569 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
32572 // If we changed the computation somewhere in the DAG, this change will
32573 // affect all users of Cond. Update all the nodes so that we do not use
32574 // the generic VSELECT anymore. Otherwise, we may perform wrong
32575 // optimizations as we messed with the actual expectation for the vector boolean contents.
32577 for (SDNode *U : Cond->uses()) {
32578 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
32579 Cond, U->getOperand(1), U->getOperand(2));
32580 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
32582 DCI.CommitTargetLoweringOpt(TLO);
32583 return SDValue(N, 0);
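// Editor's note (illustrative): BLENDVPS/PBLENDVB read only the sign bit
// of each condition element, which is why only the sign mask is demanded
// above; e.g. a condition computed as (x < 0 ? -1 : 0) can usually be
// simplified to x itself, since just the top bit of each lane is consumed.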
32586 /// Do target-specific dag combines on SELECT and VSELECT nodes.
32587 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
32588 TargetLowering::DAGCombinerInfo &DCI,
32589 const X86Subtarget &Subtarget) {
32591 SDValue Cond = N->getOperand(0);
32592 // Get the LHS/RHS of the select.
32593 SDValue LHS = N->getOperand(1);
32594 SDValue RHS = N->getOperand(2);
32595 EVT VT = LHS.getValueType();
32596 EVT CondVT = Cond.getValueType();
32597 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32599 // Convert vselects with constant condition into shuffles.
32600 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
32601 DCI.isBeforeLegalizeOps()) {
32602 SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
32603 for (int i = 0, Size = Mask.size(); i != Size; ++i) {
32604 SDValue CondElt = Cond->getOperand(i);
32606 // Arbitrarily choose from the 2nd operand if the select condition element is undef.
32608 // TODO: Can we do better by matching patterns such as even/odd?
32609 if (CondElt.isUndef() || isNullConstant(CondElt))
32613 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
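// Worked example (editor's addition): for
//   vselect <4 x i1> <1, 0, 1, 0>, %LHS, %RHS
// the loop above builds the mask <0, 5, 2, 7>: true lanes take LHS
// elements 0 and 2, false (or undef) lanes take RHS elements 1 and 3.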
32616 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
32617 // instructions match the semantics of the common C idiom x<y?x:y but not
32618 // x<=y?x:y, because of how they handle negative zero (which can be
32619 // ignored in unsafe-math mode).
32620 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
32621 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
32622 VT != MVT::f80 && VT != MVT::f128 &&
32623 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
32624 (Subtarget.hasSSE2() ||
32625 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
32626 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32628 unsigned Opcode = 0;
32629 // Check for x CC y ? x : y.
32630 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32631 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32635 // Converting this to a min would handle NaNs incorrectly, and swapping
32636 // the operands would cause it to handle comparisons between positive
32637 // and negative zero incorrectly.
32638 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32639 if (!DAG.getTarget().Options.UnsafeFPMath &&
32640 !(DAG.isKnownNeverZeroFloat(LHS) ||
32641 DAG.isKnownNeverZeroFloat(RHS)))
32643 std::swap(LHS, RHS);
32645 Opcode = X86ISD::FMIN;
32648 // Converting this to a min would handle comparisons between positive
32649 // and negative zero incorrectly.
32650 if (!DAG.getTarget().Options.UnsafeFPMath &&
32651 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
32653 Opcode = X86ISD::FMIN;
32656 // Converting this to a min would handle both negative zeros and NaNs
32657 // incorrectly, but we can swap the operands to fix both.
32658 std::swap(LHS, RHS);
32663 Opcode = X86ISD::FMIN;
32667 // Converting this to a max would handle comparisons between positive
32668 // and negative zero incorrectly.
32669 if (!DAG.getTarget().Options.UnsafeFPMath &&
32670 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
32672 Opcode = X86ISD::FMAX;
32675 // Converting this to a max would handle NaNs incorrectly, and swapping
32676 // the operands would cause it to handle comparisons between positive
32677 // and negative zero incorrectly.
32678 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32679 if (!DAG.getTarget().Options.UnsafeFPMath &&
32680 !(DAG.isKnownNeverZeroFloat(LHS) ||
32681 DAG.isKnownNeverZeroFloat(RHS)))
32683 std::swap(LHS, RHS);
32685 Opcode = X86ISD::FMAX;
32688 // Converting this to a max would handle both negative zeros and NaNs
32689 // incorrectly, but we can swap the operands to fix both.
32690 std::swap(LHS, RHS);
32695 Opcode = X86ISD::FMAX;
32698 // Check for x CC y ? y : x -- a min/max with reversed arms.
32699 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
32700 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
32704 // Converting this to a min would handle comparisons between positive
32705 // and negative zero incorrectly, and swapping the operands would
32706 // cause it to handle NaNs incorrectly.
32707 if (!DAG.getTarget().Options.UnsafeFPMath &&
32708 !(DAG.isKnownNeverZeroFloat(LHS) ||
32709 DAG.isKnownNeverZeroFloat(RHS))) {
32710 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32712 std::swap(LHS, RHS);
32714 Opcode = X86ISD::FMIN;
32717 // Converting this to a min would handle NaNs incorrectly.
32718 if (!DAG.getTarget().Options.UnsafeFPMath &&
32719 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
32721 Opcode = X86ISD::FMIN;
32724 // Converting this to a min would handle both negative zeros and NaNs
32725 // incorrectly, but we can swap the operands to fix both.
32726 std::swap(LHS, RHS);
32731 Opcode = X86ISD::FMIN;
32735 // Converting this to a max would handle NaNs incorrectly.
32736 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32738 Opcode = X86ISD::FMAX;
32741 // Converting this to a max would handle comparisons between positive
32742 // and negative zero incorrectly, and swapping the operands would
32743 // cause it to handle NaNs incorrectly.
32744 if (!DAG.getTarget().Options.UnsafeFPMath &&
32745 !DAG.isKnownNeverZeroFloat(LHS) &&
32746 !DAG.isKnownNeverZeroFloat(RHS)) {
32747 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32749 std::swap(LHS, RHS);
32751 Opcode = X86ISD::FMAX;
32754 // Converting this to a max would handle both negative zeros and NaNs
32755 // incorrectly, but we can swap the operands to fix both.
32756 std::swap(LHS, RHS);
32761 Opcode = X86ISD::FMAX;
32767 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
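// Illustrative example (editor's addition): with SSE, 'x < y ? x : y' on
// f32 becomes X86ISD::FMIN, i.e. a single minss. The checks above matter
// because minss is not commutative: it returns its second source operand
// when either input is NaN, and the result of comparing +0.0 with -0.0
// depends on operand order, hence the operand swaps.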
32770 // Some mask scalar intrinsics rely on checking if only one bit is set
32771 // and implement it in C code like this:
32772 // A[0] = (U & 1) ? A[0] : W[0];
32773 // This creates some redundant instructions that break pattern matching.
32774 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> select (and X, 1), Z, Y
32775 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
32776 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
32777 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32778 SDValue AndNode = Cond.getOperand(0);
32779 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
32780 isNullConstant(Cond.getOperand(1)) &&
32781 isOneConstant(AndNode.getOperand(1))) {
32782 // LHS and RHS are swapped because
32783 // setcc outputs 1 when the AND resulted in 0, and vice versa.
32784 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
32785 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
32789 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
32790 // lowering on KNL. In this case we convert it to
32791 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
32792 // The same applies to all vectors of i8 and i16 without BWI.
32793 // Make sure we extend these even before type legalization gets a chance to
32794 // split wide vectors.
32795 // Since SKX, these selects have a proper lowering.
32796 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
32797 CondVT.getVectorElementType() == MVT::i1 &&
32798 VT.getVectorNumElements() > 4 &&
32799 (VT.getVectorElementType() == MVT::i8 ||
32800 VT.getVectorElementType() == MVT::i16)) {
32801 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
32802 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
32805 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
32808 // Canonicalize max and min:
32809 // (x > y) ? x : y -> (x >= y) ? x : y
32810 // (x < y) ? x : y -> (x <= y) ? x : y
32811 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
32812 // the need for an extra compare
32813 // against zero. e.g.
32814 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
32816 //   subl %esi, %edi; testl %edi, %edi; movl $0, %eax; cmovgl %edi, %eax
32818 // becomes
32822 //   xorl %eax, %eax; subl %esi, %edi; cmovsl %eax, %edi
32823 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
32824 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32825 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32826 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32831 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
32832 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
32833 Cond.getOperand(0), Cond.getOperand(1), NewCC);
32834 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
32839 // Early exit check
32840 if (!TLI.isTypeLegal(VT))
32843 // Match VSELECTs into subs with unsigned saturation.
32844 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
32845 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
32846 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
32847 (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
32848 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32850 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
32851 // left side, invert the predicate to simplify the logic below.
32853 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
32855 CC = ISD::getSetCCInverse(CC, true);
32856 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
32860 if (Other.getNode() && Other->getNumOperands() == 2 &&
32861 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
32862 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
32863 SDValue CondRHS = Cond->getOperand(1);
32865 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
32866 ArrayRef<SDValue> Ops) {
32867 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
32870 // Look for a general sub with unsigned saturation first.
32871 // x >= y ? x-y : 0 --> subus x, y
32872 // x > y ? x-y : 0 --> subus x, y
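// Illustrative example (editor's addition): for <8 x i16>,
//   %d = sub %x, %y
//   %r = vselect (setuge %x, %y), %d, zeroinitializer
// matches the pattern above and becomes a single psubusw.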
32873 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
32874 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
32875 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32878 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
32879 if (isa<BuildVectorSDNode>(CondRHS)) {
32880 // If the RHS is a constant we have to reverse the const
32881 // canonicalization.
32882 // x > C-1 ? x+(-C) : 0 --> subus x, C
32883 auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
32884 return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
32886 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
32887 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
32888 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
32889 DAG.getConstant(0, DL, VT), OpRHS);
32890 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32894 // Another special case: If C was a sign bit, the sub has been
32895 // canonicalized into a xor.
32896 // FIXME: Would it be better to use computeKnownBits to determine
32897 // whether it's safe to decanonicalize the xor?
32898 // x s< 0 ? x^C : 0 --> subus x, C
32899 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
32900 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
32901 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
32902 OpRHSConst->getAPIntValue().isSignMask()) {
32903 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
32904 // Note that we have to rebuild the RHS constant here to ensure we
32905 // don't rely on particular values of undef lanes.
32906 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32913 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
32916 if (SDValue V = combineVSelectToShrunkBlend(N, DAG, DCI, Subtarget))
32919 // Custom action for SELECT MMX
32920 if (VT == MVT::x86mmx) {
32921 LHS = DAG.getBitcast(MVT::i64, LHS);
32922 RHS = DAG.getBitcast(MVT::i64, RHS);
32923 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
32924 return DAG.getBitcast(VT, newSelect);
32931 /// Combine (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to
32933 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
32934 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
32935 /// Note that this is only legal for some op/cc combinations.
32936 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
32938 const X86Subtarget &Subtarget) {
32939 // This combine only operates on CMP-like nodes.
32940 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32941 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32944 // Can't replace the cmp if it has more uses than the one we're looking at.
32945 // FIXME: We would like to be able to handle this, but would need to make sure
32946 // all uses were updated.
32947 if (!Cmp.hasOneUse())
32950 // This only applies to variations of the common case:
32951 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
32952 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
32953 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
32954 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
32955 // Using the proper condcodes (see below), overflow is checked for.
32957 // FIXME: We can generalize both constraints:
32958 // - XOR/OR/AND (if they were made to survive AtomicExpand)
32960 // if the result is compared.
32962 SDValue CmpLHS = Cmp.getOperand(0);
32963 SDValue CmpRHS = Cmp.getOperand(1);
32965 if (!CmpLHS.hasOneUse())
32968 unsigned Opc = CmpLHS.getOpcode();
32969 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
32972 SDValue OpRHS = CmpLHS.getOperand(2);
32973 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
32977 APInt Addend = OpRHSC->getAPIntValue();
32978 if (Opc == ISD::ATOMIC_LOAD_SUB)
32981 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
32985 APInt Comparison = CmpRHSC->getAPIntValue();
32987 // If the addend is the negation of the comparison value, then we can do
32988 // a full comparison by emitting the atomic arithmetic as a locked sub.
32989 if (Comparison == -Addend) {
32990 // The CC is fine, but we need to rewrite the LHS of the comparison as an atomic sub.
32992 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
32993 auto AtomicSub = DAG.getAtomic(
32994 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
32995 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
32996 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
32997 AN->getMemOperand());
32998 // If the comparison uses the CF flag we can't use INC/DEC instructions.
32999 bool NeedCF = false;
33002 case X86::COND_A: case X86::COND_AE:
33003 case X86::COND_B: case X86::COND_BE:
33007 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
33008 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
33009 DAG.getUNDEF(CmpLHS.getValueType()));
33010 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
33014 // We can handle comparisons with zero in a number of cases by manipulating the CC used.
33016 if (!Comparison.isNullValue())
33019 if (CC == X86::COND_S && Addend == 1)
33021 else if (CC == X86::COND_NS && Addend == 1)
33023 else if (CC == X86::COND_G && Addend == -1)
33025 else if (CC == X86::COND_LE && Addend == -1)
33030 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
33031 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
33032 DAG.getUNDEF(CmpLHS.getValueType()));
33033 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
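// Illustrative example (editor's addition): for
//   %old = atomicrmw add i32* %p, i32 1
//   %cmp = icmp slt i32 %old, 0          ; COND_S on the old value
// the combine emits 'lock addl $1, (%p)' and tests its flags with COND_LE
// instead, per the (slt x, 0) -> (sle (add x, 1), 0) rewrite above.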
33037 // Check whether a boolean test is testing a boolean value generated by
33038 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition flag.
33041 // Simplify the following patterns:
33042 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
33043 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
33044 // to (Op EFLAGS Cond)
33046 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
33047 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
33048 // to (Op EFLAGS !Cond)
33050 // where Op could be BRCOND or CMOV.
33052 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
33053 // This combine only operates on CMP-like nodes.
33054 if (!(Cmp.getOpcode() == X86ISD::CMP ||
33055 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
33058 // Quit if not used as a boolean value.
33059 if (CC != X86::COND_E && CC != X86::COND_NE)
33062 // Check CMP operands. One of them should be 0 or 1 and the other should be
33063 // a SETCC or a value extended from it.
33064 SDValue Op1 = Cmp.getOperand(0);
33065 SDValue Op2 = Cmp.getOperand(1);
33068 const ConstantSDNode* C = nullptr;
33069 bool needOppositeCond = (CC == X86::COND_E);
33070 bool checkAgainstTrue = false; // Is it a comparison against 1?
33072 if ((C = dyn_cast<ConstantSDNode>(Op1)))
33074 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
33076 else // Quit if neither operand is a constant.
33079 if (C->getZExtValue() == 1) {
33080 needOppositeCond = !needOppositeCond;
33081 checkAgainstTrue = true;
33082 } else if (C->getZExtValue() != 0)
33083 // Quit if the constant is neither 0 nor 1.
33086 bool truncatedToBoolWithAnd = false;
33087 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
33088 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
33089 SetCC.getOpcode() == ISD::TRUNCATE ||
33090 SetCC.getOpcode() == ISD::AND) {
33091 if (SetCC.getOpcode() == ISD::AND) {
33093 if (isOneConstant(SetCC.getOperand(0)))
33095 if (isOneConstant(SetCC.getOperand(1)))
33099 SetCC = SetCC.getOperand(OpIdx);
33100 truncatedToBoolWithAnd = true;
33102 SetCC = SetCC.getOperand(0);
33105 switch (SetCC.getOpcode()) {
33106 case X86ISD::SETCC_CARRY:
33107 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
33108 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
33109 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
33110 // truncated to i1 using 'and'.
33111 if (checkAgainstTrue && !truncatedToBoolWithAnd)
33113 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
33114 "Invalid use of SETCC_CARRY!");
33116 case X86ISD::SETCC:
33117 // Set the condition code or opposite one if necessary.
33118 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
33119 if (needOppositeCond)
33120 CC = X86::GetOppositeBranchCondition(CC);
33121 return SetCC.getOperand(1);
33122 case X86ISD::CMOV: {
33123 // Check whether the false/true values are canonical, i.e. 0 or 1.
33124 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
33125 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
33126 // Quit if true value is not a constant.
33129 // Quit if false value is not a constant.
33131 SDValue Op = SetCC.getOperand(0);
33132 // Skip 'zext' or 'trunc' node.
33133 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
33134 Op.getOpcode() == ISD::TRUNCATE)
33135 Op = Op.getOperand(0);
33136 // A special case for rdrand/rdseed, where 0 is set if the false cond is found.
33138 if ((Op.getOpcode() != X86ISD::RDRAND &&
33139 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
33142 // Quit if false value is not the constant 0 or 1.
33143 bool FValIsFalse = true;
33144 if (FVal && FVal->getZExtValue() != 0) {
33145 if (FVal->getZExtValue() != 1)
33147 // If FVal is 1, opposite cond is needed.
33148 needOppositeCond = !needOppositeCond;
33149 FValIsFalse = false;
33151 // Quit if TVal is not the constant opposite of FVal.
33152 if (FValIsFalse && TVal->getZExtValue() != 1)
33154 if (!FValIsFalse && TVal->getZExtValue() != 0)
33156 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
33157 if (needOppositeCond)
33158 CC = X86::GetOppositeBranchCondition(CC);
33159 return SetCC.getOperand(3);
33166 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. Match:
33168 /// (X86or (X86setcc) (X86setcc))
33169 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
33170 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
33171 X86::CondCode &CC1, SDValue &Flags,
33173 if (Cond->getOpcode() == X86ISD::CMP) {
33174 if (!isNullConstant(Cond->getOperand(1)))
33177 Cond = Cond->getOperand(0);
33182 SDValue SetCC0, SetCC1;
33183 switch (Cond->getOpcode()) {
33184 default: return false;
33191 SetCC0 = Cond->getOperand(0);
33192 SetCC1 = Cond->getOperand(1);
33196 // Make sure we have SETCC nodes, using the same flags value.
33197 if (SetCC0.getOpcode() != X86ISD::SETCC ||
33198 SetCC1.getOpcode() != X86ISD::SETCC ||
33199 SetCC0->getOperand(1) != SetCC1->getOperand(1))
33202 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
33203 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
33204 Flags = SetCC0->getOperand(1);
33208 // When legalizing carry, we create carries via add X, -1
33209 // If that comes from an actual carry, via setcc, we use the carry directly.
33211 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
33212 if (EFLAGS.getOpcode() == X86ISD::ADD) {
33213 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
33214 SDValue Carry = EFLAGS.getOperand(0);
33215 while (Carry.getOpcode() == ISD::TRUNCATE ||
33216 Carry.getOpcode() == ISD::ZERO_EXTEND ||
33217 Carry.getOpcode() == ISD::SIGN_EXTEND ||
33218 Carry.getOpcode() == ISD::ANY_EXTEND ||
33219 (Carry.getOpcode() == ISD::AND &&
33220 isOneConstant(Carry.getOperand(1))))
33221 Carry = Carry.getOperand(0);
33222 if (Carry.getOpcode() == X86ISD::SETCC ||
33223 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
33224 if (Carry.getConstantOperandVal(0) == X86::COND_B)
33225 return Carry.getOperand(1);
33233 /// Optimize an EFLAGS definition used according to the condition code \p CC
33234 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
33235 /// uses of chain values.
33236 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
33238 const X86Subtarget &Subtarget) {
33239 if (CC == X86::COND_B)
33240 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
33243 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
33245 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
33248 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
33249 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
33250 TargetLowering::DAGCombinerInfo &DCI,
33251 const X86Subtarget &Subtarget) {
33254 SDValue FalseOp = N->getOperand(0);
33255 SDValue TrueOp = N->getOperand(1);
33256 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
33257 SDValue Cond = N->getOperand(3);
33259 // Try to simplify the EFLAGS and condition code operands.
33260 // We can't always do this as FCMOV only supports a subset of X86 cond.
33261 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
33262 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
33263 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
33265 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33269 // If this is a select between two integer constants, try to do some
33270 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
33272 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
33273 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
33274 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
33275 // larger than FalseC (the false value).
33276 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
33277 CC = X86::GetOppositeBranchCondition(CC);
33278 std::swap(TrueC, FalseC);
33279 std::swap(TrueOp, FalseOp);
33282 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
33283 // This is efficient for any integer data type (including i8/i16) and any shift amount.
33285 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
33286 Cond = getSETCC(CC, Cond, DL, DAG);
33288 // Zero extend the condition if needed.
33289 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
33291 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
33292 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
33293 DAG.getConstant(ShAmt, DL, MVT::i8));
33297 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
33298 // for any integer data type, including i8/i16.
33299 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
33300 Cond = getSETCC(CC, Cond, DL, DAG);
33302 // Zero extend the condition if needed.
33303 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
33304 FalseC->getValueType(0), Cond);
33305 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33306 SDValue(FalseC, 0));
33310 // Optimize cases that will turn into an LEA instruction. This requires
33311 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
33312 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
33313 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
33314 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
33316 bool isFastMultiplier = false;
33318 switch ((unsigned char)Diff) {
33320 case 1: // result = add base, cond
33321 case 2: // result = lea base( , cond*2)
33322 case 3: // result = lea base(cond, cond*2)
33323 case 4: // result = lea base( , cond*4)
33324 case 5: // result = lea base(cond, cond*4)
33325 case 8: // result = lea base( , cond*8)
33326 case 9: // result = lea base(cond, cond*8)
33327 isFastMultiplier = true;
33332 if (isFastMultiplier) {
33333 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
33334 Cond = getSETCC(CC, Cond, DL, DAG);
33335 // Zero extend the condition if needed.
33336 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
33338 // Scale the condition by the difference.
33340 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
33341 DAG.getConstant(Diff, DL, Cond.getValueType()));
33343 // Add the base if non-zero.
33344 if (FalseC->getAPIntValue() != 0)
33345 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33346 SDValue(FalseC, 0));
33353 // Handle these cases:
33354 // (select (x != c), e, c) -> select (x != c), e, x),
33355 // (select (x == c), c, e) -> select (x == c), x, e)
33356 // where c is an integer constant, and the "select" is the combination
33357 // of CMOV and CMP.
33359 // The rationale for this change is that the conditional-move from a constant
33360 // needs two instructions, however, conditional-move from a register needs
33361 // only one instruction.
33363 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
33364 // some instruction-combining opportunities. This opt needs to be
33365 // postponed as late as possible.
33367 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
33368 // The DCI.xxxx conditions are provided to postpone the optimization as
33369 // late as possible.
33371 ConstantSDNode *CmpAgainst = nullptr;
33372 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
33373 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
33374 !isa<ConstantSDNode>(Cond.getOperand(0))) {
33376 if (CC == X86::COND_NE &&
33377 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
33378 CC = X86::GetOppositeBranchCondition(CC);
33379 std::swap(TrueOp, FalseOp);
33382 if (CC == X86::COND_E &&
33383 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
33384 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
33385 DAG.getConstant(CC, DL, MVT::i8), Cond };
33386 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33391 // Fold and/or of setcc's to double CMOV:
33392 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
33393 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
33395 // This combine lets us generate:
33396 // cmovcc1 (jcc1 if we don't have CMOV)
33402 // cmovne (jne if we don't have CMOV)
33403 // When we can't use the CMOV instruction, it might increase branch mispredicts.
33405 // When we can use CMOV, or when there is no mispredict, this improves
33406 // throughput and reduces register pressure.
33408 if (CC == X86::COND_NE) {
33410 X86::CondCode CC0, CC1;
33412 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
33414 std::swap(FalseOp, TrueOp);
33415 CC0 = X86::GetOppositeBranchCondition(CC0);
33416 CC1 = X86::GetOppositeBranchCondition(CC1);
33419 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
33421 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
33422 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
33423 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33428 // Handle (CMOV C-1, (ADD (CTTZ X), C), (X != 0)) ->
33429 // (ADD (CMOV (CTTZ X), -1, (X != 0)), C) or
33430 // (CMOV (ADD (CTTZ X), C), C-1, (X == 0)) ->
33431 // (ADD (CMOV C-1, (CTTZ X), (X == 0)), C)
33432 if (CC == X86::COND_NE || CC == X86::COND_E) {
33433 auto *Cnst = CC == X86::COND_E ? dyn_cast<ConstantSDNode>(TrueOp)
33434 : dyn_cast<ConstantSDNode>(FalseOp);
33435 SDValue Add = CC == X86::COND_E ? FalseOp : TrueOp;
33437 if (Cnst && Add.getOpcode() == ISD::ADD && Add.hasOneUse()) {
33438 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
33439 SDValue AddOp2 = Add.getOperand(0);
33440 if (AddOp1 && (AddOp2.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
33441 AddOp2.getOpcode() == ISD::CTTZ)) {
33442 APInt Diff = Cnst->getAPIntValue() - AddOp1->getAPIntValue();
33443 if (CC == X86::COND_E) {
33444 Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(), AddOp2,
33445 DAG.getConstant(Diff, DL, Add.getValueType()),
33446 DAG.getConstant(CC, DL, MVT::i8), Cond);
33448 Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(),
33449 DAG.getConstant(Diff, DL, Add.getValueType()),
33450 AddOp2, DAG.getConstant(CC, DL, MVT::i8), Cond);
33452 return DAG.getNode(X86ISD::ADD, DL, Add.getValueType(), Add,
33453 SDValue(AddOp1, 0));
33461 /// Different mul shrinking modes.
33462 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
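// Editor's note (illustrative): the thresholds in canReduceVMulWidth below
// come from sign-bit counting on i32 lanes. With >= 25 known sign bits a
// lane has at most 8 significant bits including the sign, so it fits
// [-128, 127] (MULS8); >= 17 fits i16 (MULS16). For known-non-negative
// lanes the sign bit is 0, so >= 24 leading zero/sign bits bound the value
// to [0, 255] (MULU8) and >= 16 to [0, 65535] (MULU16).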
33464 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
33465 EVT VT = N->getOperand(0).getValueType();
33466 if (VT.getScalarSizeInBits() != 32)
33469 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
33470 unsigned SignBits[2] = {1, 1};
33471 bool IsPositive[2] = {false, false};
33472 for (unsigned i = 0; i < 2; i++) {
33473 SDValue Opd = N->getOperand(i);
33475 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
33476 // compute the sign bits for it separately.
33477 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
33478 // For anyextend, it is safe to assume an appropriate number of leading sign/zero bits.
33480 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
33482 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
33487 IsPositive[i] = true;
33488 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
33489 // All the operands of BUILD_VECTOR need to be integer constants.
33490 // Find the smallest value range which all the operands belong to.
33492 IsPositive[i] = true;
33493 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
33494 if (SubOp.isUndef())
33496 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
33499 APInt IntVal = CN->getAPIntValue();
33500 if (IntVal.isNegative())
33501 IsPositive[i] = false;
33502 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
33505 SignBits[i] = DAG.ComputeNumSignBits(Opd);
33506 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
33507 IsPositive[i] = true;
33511 bool AllPositive = IsPositive[0] && IsPositive[1];
33512 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
33513 // When ranges are from -128 ~ 127, use MULS8 mode.
33514 if (MinSignBits >= 25)
33516 // When ranges are from 0 ~ 255, use MULU8 mode.
33517 else if (AllPositive && MinSignBits >= 24)
33519 // When ranges are from -32768 ~ 32767, use MULS16 mode.
33520 else if (MinSignBits >= 17)
33522 // When ranges are from 0 ~ 65535, use MULU16 mode.
33523 else if (AllPositive && MinSignBits >= 16)
33530 /// When the operands of a vector mul are extended from smaller-sized values,
33531 /// like i8 and i16, the type of the mul may be shrunk to generate more
33532 /// efficient code. Two typical patterns are handled:
33534 /// %2 = sext/zext <N x i8> %1 to <N x i32>
33535 /// %4 = sext/zext <N x i8> %3 to <N x i32>
33536 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33537 /// %5 = mul <N x i32> %2, %4
33540 /// %2 = zext/sext <N x i16> %1 to <N x i32>
33541 /// %4 = zext/sext <N x i16> %3 to <N x i32>
33542 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33543 /// %5 = mul <N x i32> %2, %4
33545 /// There are four mul shrinking modes:
33546 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
33547 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
33548 /// generate pmullw+sext32 for it (MULS8 mode).
33549 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
33550 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
33551 /// generate pmullw+zext32 for it (MULU8 mode).
33552 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
33553 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
33554 /// generate pmullw+pmulhw for it (MULS16 mode).
33555 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
33556 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
33557 /// generate pmullw+pmulhuw for it (MULU16 mode).
33558 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
33559 const X86Subtarget &Subtarget) {
33560 // Check for legality
33561 // pmullw/pmulhw are not supported by SSE.
33562 if (!Subtarget.hasSSE2())
33565 // Check for profitability
33566 // pmulld is supported since SSE41. It is better to use pmulld
33567 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than pmullw+pmulhw.
33569 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
33570 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
33574 if (!canReduceVMulWidth(N, DAG, Mode))
33578 SDValue N0 = N->getOperand(0);
33579 SDValue N1 = N->getOperand(1);
33580 EVT VT = N->getOperand(0).getValueType();
33581 unsigned NumElts = VT.getVectorNumElements();
33582 if ((NumElts % 2) != 0)
33585 unsigned RegSize = 128;
33586 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
33587 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
33589 // Shrink the operands of mul.
33590 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
33591 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
33593 if (NumElts >= OpsVT.getVectorNumElements()) {
33594 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
33595 // lower part is needed.
33596 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
33597 if (Mode == MULU8 || Mode == MULS8) {
33598 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
33601 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
33602 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
33603 // the higher part is also needed.
33604 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33605 ReducedVT, NewN0, NewN1);
33607 // Repack the lower part and higher part result of mul into a wider result.
33609 // Generate shuffle functioning as punpcklwd.
33610 SmallVector<int, 16> ShuffleMask(NumElts);
33611 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33612 ShuffleMask[2 * i] = i;
33613 ShuffleMask[2 * i + 1] = i + NumElts;
33616 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33617 ResLo = DAG.getBitcast(ResVT, ResLo);
33618 // Generate shuffle functioning as punpckhwd.
33619 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33620 ShuffleMask[2 * i] = i + NumElts / 2;
33621 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
33624 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33625 ResHi = DAG.getBitcast(ResVT, ResHi);
33626 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
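// Illustrative instruction mapping (editor's addition) for the MULU16 path
// above, e.g. a v8i32 mul whose operands fit in u16:
//   MulLo = pmullw(a, b)    ; low 16 bits of each 32-bit product
//   MulHi = pmulhuw(a, b)   ; high 16 bits of each 32-bit product
//   ResLo/ResHi = punpcklwd/punpckhwd(MulLo, MulHi)
// interleaving the halves back into full i32 products while avoiding
// pmulld on subtargets where it is slow.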
33629 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
33630 // to legalize the mul explicitly because implicit legalization for type
33631 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
33632 // instructions which will not exist when we explicitly legalize it by
33633 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
33634 // <4 x i16> undef).
33636 // Legalize the operands of mul.
33637 // FIXME: We may be able to handle non-concatenated vectors by insertion.
33638 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
33639 if ((RegSize % ReducedSizeInBits) != 0)
33642 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
33643 DAG.getUNDEF(ReducedVT));
33645 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33647 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33649 if (Mode == MULU8 || Mode == MULS8) {
33650 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
33652 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33654 // convert the type of mul result to VT.
33655 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
33656 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
33657 : ISD::SIGN_EXTEND_VECTOR_INREG,
33659 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33660 DAG.getIntPtrConstant(0, DL));
33662 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
33663 // MULU16/MULS16, both parts are needed.
33664 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33665 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33666 OpsVT, NewN0, NewN1);
33668 // Repack the lower part and higher part result of mul into a wider
33669 // result. Make sure the type of mul result is VT.
33670 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
33671 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
33672 Res = DAG.getBitcast(ResVT, Res);
33673 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33674 DAG.getIntPtrConstant(0, DL));
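
// Illustrative example for the MULU16 path above: for a v8i32 multiply whose
// operands are known to fit in 16 bits, pmullw produces the low 16 bits of
// each product and pmulhuw the high 16 bits; punpcklwd/punpckhwd then
// interleave the two v8i16 results into two v4i32 halves, which are
// concatenated back into the v8i32 result.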
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
                                 EVT VT, const SDLoc &DL) {

  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(Mult, DL, VT));
    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
                         DAG.getConstant(Shift, DL, MVT::i8));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

  auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(Mul1, DL, VT));
    Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
                         DAG.getConstant(Mul2, DL, VT));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

  switch (MulAmt) {
  default:
    break;
  case 11:
    // mul x, 11 => add ((shl (mul x, 5), 1), x)
    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
  case 21:
    // mul x, 21 => add ((shl (mul x, 5), 2), x)
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
  case 41:
    // mul x, 41 => add ((shl (mul x, 5), 3), x)
    return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
  case 22:
    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
  case 19:
    // mul x, 19 => add ((shl (mul x, 9), 1), x)
    return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
  case 37:
    // mul x, 37 => add ((shl (mul x, 9), 2), x)
    return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
  case 73:
    // mul x, 73 => add ((shl (mul x, 9), 3), x)
    return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
  case 13:
    // mul x, 13 => add ((shl (mul x, 3), 2), x)
    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
  case 23:
    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
  case 26:
    // mul x, 26 => add ((mul (mul x, 5), 5), x)
    return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
  case 28:
    // mul x, 28 => add ((mul (mul x, 9), 3), x)
    return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
  case 29:
    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
  }

  // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
  // followed by a single LEA.
  // First check if this is a sum of two powers of 2 because that's easy. Then
  // count how many zeros are up to the first bit.
  // TODO: We can do this even without LEA at a cost of two shifts and an add.
  if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
    unsigned ScaleShift = countTrailingZeros(MulAmt);
    if (ScaleShift >= 1 && ScaleShift < 4) {
      unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
      SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                   DAG.getConstant(ShiftAmt, DL, MVT::i8));
      SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                   DAG.getConstant(ScaleShift, DL, MVT::i8));
      return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
    }
  }

  return SDValue();
}
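
// Illustrative example for the shift+LEA trick above: mul x, 68 (68 = 64 + 4,
// two set bits, trailing-zero count 2) becomes (x << 6) + (x << 2), which
// instruction selection can typically form as one shift plus one LEA using
// its scale-by-4 addressing mode.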
// If the upper 17 bits of each element are zero then we can use PMADDWD,
// which is always at least as quick as PMULLD, except on KNL.
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
    return SDValue();

  EVT VT = N->getValueType(0);

  // Only support vXi32 vectors.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
    return SDValue();

  // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
  if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  APInt Mask17 = APInt::getHighBitsSet(32, 17);
  if (!DAG.MaskedValueIsZero(N1, Mask17) ||
      !DAG.MaskedValueIsZero(N0, Mask17))
    return SDValue();

  // Use SplitOpsAndApply to handle AVX splitting.
  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
    MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
  };
  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
                          { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
                          PMADDWDBuilder);
}
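
// Why 17 bits: with the upper 17 bits of each i32 element known zero, every
// element is a nonnegative value fitting in 15 bits. Reinterpreted as vXi16,
// the high half of each pair is zero, so vpmaddwd computes
// lo0*lo1 + hi0*hi1 == lo0*lo1, i.e. the exact 32-bit product.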
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT VT = N->getValueType(0);

  // Only support vXi64 vectors.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32-bits. We can lower with this if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
      DAG.ComputeNumSignBits(N1) > 32) {
    auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
      return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
                            PMULDQBuilder, /*CheckBWI*/false);
  }

  // If the upper bits are zero we can use a single pmuludq.
  APInt Mask = APInt::getHighBitsSet(64, 32);
  if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
    auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                             ArrayRef<SDValue> Ops) {
      return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
                            PMULUDQBuilder, /*CheckBWI*/false);
  }

  return SDValue();
}
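
// Note: ComputeNumSignBits(V) > 32 means each i64 element of V is the
// sign-extension of its low 32 bits, so PMULDQ's signed 32x32->64 multiply
// of the low halves reproduces the full 64-bit product exactly.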
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
    return V;

  if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
    return V;

  if (DCI.isBeforeLegalize() && VT.isVector())
    return reduceVMULWidth(N, DAG, Subtarget);

  if (!MulConstantOptimization)
    return SDValue();

  // An imul is usually smaller than the alternative sequence.
  if (DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT != MVT::i64 && VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt))
    return SDValue();

  SDLoc DL(N);
  if (MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                       N->getOperand(1));

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }

  SDValue NewMul;
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If second multiplier is pow2, issue it first. We want the multiply by
      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
      // is an add.
      std::swap(MulAmt1, MulAmt2);

    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, DL, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, DL, VT));
  } else if (!Subtarget.slowLEA())
    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);

  if (!NewMul) {
    assert(MulAmt != 0 &&
           MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
           "Both cases that could cause potential overflows should have "
           "already been handled.");
    int64_t SignMulAmt = C->getSExtValue();
    assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
    uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
    if (isPowerOf2_64(AbsMulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      NewMul = DAG.getNode(
          ISD::ADD, DL, VT, N->getOperand(0),
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
                                      MVT::i8)));
      // To negate, subtract the number from zero
      if (SignMulAmt < 0)
        NewMul = DAG.getNode(ISD::SUB, DL, VT,
                             DAG.getConstant(0, DL, VT), NewMul);
    } else if (isPowerOf2_64(AbsMulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(AbsMulAmt + 1),
                                           DL, MVT::i8));
      // To negate, reverse the operands of the subtract.
      if (SignMulAmt < 0)
        NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
      else
        NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
    } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
      // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(AbsMulAmt - 2),
                                           DL, MVT::i8));
      NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
      NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
    } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
      // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(AbsMulAmt + 2),
                                           DL, MVT::i8));
      NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
      NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
    }
  }

  return NewMul;
}
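
// Illustrative example for the negative-amount cases above: mul x, -7 has
// AbsMulAmt == 7 == 2^3 - 1, so we build (shl x, 3) and, since SignMulAmt
// is negative, emit (sub x, (shl x, 3)) == x - 8*x == -7*x.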
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zero's or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
    Mask <<= N1C->getAPIntValue();
    bool MaskOK = false;
    // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
    // of the underlying setcc_c operation if the setcc_c was zero extended.
    // Consider the following example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
                N00.getOpcode() == ISD::ANY_EXTEND) &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
    if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
    }
  }

  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // shl.
  // (shl V, 1) -> add V,V
  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
      assert(N0.getValueType().isVector() && "Invalid vector shift type");
      // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an ADD
      // of two values.
      if (N1SplatC->getAPIntValue() == 1)
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
    }

  return SDValue();
}
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();

  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
  // depending on sign of (SarConst - [56,48,32,24,16])

  // sexts in X86 are MOVs. The MOVs have the same code size
  // as above SHIFTs (only SHIFT on 1 has lower code size).
  // However the MOVs have 2 advantages to a SHIFT:
  // 1. MOVs can write to a register that differs from source
  // 2. MOVs accept memory operands

  if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
  EVT CVT = N1.getValueType();

  if (SarConst.isNegative())
    return SDValue();

  for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
    unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and
    // ShlConst values that are not one of [56,48,32,24,16].
    if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
                         DAG.getConstant(SarConst, DL, CVT));
  }
  return SDValue();
}
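
// Illustrative example: (i32 (sra (shl X, 24), 25)) becomes
// (sra (sext_in_reg X, i8), 1): the shl/sra pair sign-extends the low byte,
// leaving a single arithmetic shift by 25 - 24 == 1.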
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  // Only do this on the last DAG combine as it can interfere with other
  // combines.
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
  // TODO: This is a generic DAG combine that became an x86-only combine to
  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
  // and-not ('andn').
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();

  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!ShiftC || !AndC)
    return SDValue();

  // If we can shrink the constant mask below 8-bits or 32-bits, then this
  // transform should reduce code size. It may also enable secondary transforms
  // from improved known-bits analysis or instruction selection.
  APInt MaskVal = AndC->getAPIntValue();

  // If this can be matched by a zero extend, don't optimize.
  if (MaskVal.isMask()) {
    unsigned TO = MaskVal.countTrailingOnes();
    if (TO >= 8 && isPowerOf2_32(TO))
      return SDValue();
  }

  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
  unsigned OldMaskSize = MaskVal.getMinSignedBits();
  unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
  if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
      (OldMaskSize > 32 && NewMaskSize <= 32)) {
    // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
    SDLoc DL(N);
    SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
    SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
  }
  return SDValue();
}
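
// Illustrative example: (srl (and X, 0x7F0), 4) becomes
// (and (srl X, 4), 0x7F). The old 12-bit mask needs a 32-bit immediate
// encoding, while the new 0x7F mask fits in a sign-extended imm8.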
static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  if (N->getOpcode() == ISD::SHL)
    if (SDValue V = combineShiftLeft(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRA)
    if (SDValue V = combineShiftRightArithmetic(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRL)
    if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
      return V;

  return SDValue();
}
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected pack opcode");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
         "Unexpected PACKSS/PACKUS input type");

  // Constant Folding.
  APInt UndefElts0, UndefElts1;
  SmallVector<APInt, 32> EltBits0, EltBits1;
  if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
      (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
    unsigned NumLanes = VT.getSizeInBits() / 128;
    unsigned NumDstElts = VT.getVectorNumElements();
    unsigned NumSrcElts = NumDstElts / 2;
    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
    bool IsSigned = (X86ISD::PACKSS == Opcode);

    APInt Undefs(NumDstElts, 0);
    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);

        if (UndefElts[SrcIdx]) {
          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
          continue;
        }

        APInt &Val = EltBits[SrcIdx];
        if (IsSigned) {
          // PACKSS: Truncate signed value with signed saturation.
          // Source values less than dst minint are saturated to minint.
          // Source values greater than dst maxint are saturated to maxint.
          if (Val.isSignedIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getSignedMinValue(DstBitsPerElt);
          else
            Val = APInt::getSignedMaxValue(DstBitsPerElt);
        } else {
          // PACKUS: Truncate signed value with unsigned saturation.
          // Source values less than zero are saturated to zero.
          // Source values greater than dst maxuint are saturated to maxuint.
          if (Val.isIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getNullValue(DstBitsPerElt);
          else
            Val = APInt::getAllOnesValue(DstBitsPerElt);
        }
        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
      }
    }

    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  // Attempt to combine as shuffle.
  SDValue Op(N, 0);
  if (SDValue Res =
          combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
                                        /*HasVarMask*/ false, DAG, Subtarget))
    return Res;

  return SDValue();
}
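
// Constant-folding example: packssdw of a lane holding 70000 saturates to
// 32767 (i16 maxint), and -70000 saturates to -32768 (i16 minint), matching
// the hardware's signed-saturation semantics.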
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
          X86ISD::VSRLI == Opcode) &&
         "Unexpected shift opcode");
  bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
         "Unexpected value type");

  // Out of range logical bit shifts are guaranteed to be zero.
  // Out of range arithmetic bit shifts splat the sign bit.
  APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
    if (LogicalShift)
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
    else
      ShiftVal = NumBitsPerElt - 1;
  }

  // Shift N0 by zero -> N0.
  if (!ShiftVal)
    return N0;

  // Shift zero -> zero.
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

  // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
  // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
  // TODO - support other sra opcodes as needed.
  if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
      N0.getOpcode() == X86ISD::VSRAI)
    return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);

  // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
  if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
      N1 == N0.getOperand(1)) {
    SDValue N00 = N0.getOperand(0);
    unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
    if (ShiftVal.ult(NumSignBits))
      return N00;
  }

  // We can decode 'whole byte' logical bit shifts as shuffles.
  if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, Subtarget))
      return Res;
  }

  // Constant Folding.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (N->isOnlyUserOf(N0.getNode()) &&
      getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
    assert(EltBits.size() == VT.getVectorNumElements() &&
           "Unexpected shift value type");
    unsigned ShiftImm = ShiftVal.getZExtValue();
    for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
        Elt <<= ShiftImm;
      else if (X86ISD::VSRAI == Opcode)
        Elt.ashrInPlace(ShiftImm);
      else
        Elt.lshrInPlace(ShiftImm);
    }
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  return SDValue();
}
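
// e.g. a VSRLI by 16 on v4i32 moves the top two bytes of each element to the
// bottom and zero-fills the rest, so it is representable as a byte shuffle
// that combineX86ShufflesRecursively can merge with neighbouring shuffles.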
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  assert(
      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
       (N->getOpcode() == X86ISD::PINSRW &&
        N->getValueType(0) == MVT::v8i16)) &&
      "Unexpected vector insertion");

  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  SDValue Op(N, 0);
  if (SDValue Res =
          combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
                                        /*HasVarMask*/ false, DAG, Subtarget))
    return Res;

  return SDValue();
}
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT VT = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          if (Subtarget.hasAVX512()) {
            SDValue FSetCC =
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                            DAG.getConstant(x86cc, DL, MVT::i8));
            // Need to fill with zeros to ensure the bitcast will produce zeroes
            // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
            SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
                                      DAG.getConstant(0, DL, MVT::v16i1),
                                      FSetCC, DAG.getIntPtrConstant(0, DL));
            return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
                                      N->getSimpleValueType(0));
          }
          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
                                              CMP00.getValueType(), CMP00, CMP01,
                                              DAG.getConstant(x86cc, DL,
                                                              MVT::i8));

          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

          if (is64BitFP && !Subtarget.is64Bit()) {
            // On a 32-bit target, we cannot bitcast the 64-bit float to a
            // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
            // bits, but can do this little dance to extract the lowest 32 bits
            // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                           OnesOrZeroesF);
            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                        Vector32, DAG.getIntPtrConstant(0, DL));
            IntVT = MVT::i32;
          }

          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                      DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}
// Try to match (and (xor X, -1), Y) logic pattern for (andnp X, Y) combines.
static bool matchANDXORWithAllOnesAsANDNP(SDNode *N, SDValue &X, SDValue &Y) {
  if (N->getOpcode() != ISD::AND)
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
    X = N0.getOperand(0);
    Y = N1;
    return true;
  }
  if (N1.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
    X = N1.getOperand(0);
    Y = N0;
    return true;
  }

  return false;
}
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::AND);

  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
    return SDValue();

  SDValue X, Y;
  if (matchANDXORWithAllOnesAsANDNP(N, X, Y))
    return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);

  return SDValue();
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  assert(VT.isVector() && "Expected vector type");

  assert((N->getOpcode() == ISD::ANY_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND ||
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N->getOperand(0);
  EVT NarrowVT = Narrow.getValueType();

  if (Narrow->getOpcode() != ISD::XOR &&
      Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)
    return SDValue();

  SDValue N0 = Narrow->getOperand(0);
  SDValue N1 = Narrow->getOperand(1);
  SDLoc DL(Narrow);

  // The left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  // The type of the truncated inputs.
  if (N0->getOperand(0).getValueType() != VT)
    return SDValue();

  // The right side has to be a 'trunc' or a constant vector.
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                  N1.getOperand(0).getValueType() == VT;
  if (!RHSTrunc &&
      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
    return SDValue();

  // Set N0 and N1 to hold the inputs to the new wide operation.
  N0 = N0->getOperand(0);
  if (RHSTrunc)
    N1 = N1->getOperand(0);
  else
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);

  // Generate the wide operation.
  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default: llvm_unreachable("Unexpected opcode");
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND:
    return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
  }
}
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  unsigned FPOpcode = ISD::DELETED_NODE;
  if (N->getOpcode() == ISD::AND)
    FPOpcode = X86ISD::FAND;
  else if (N->getOpcode() == ISD::OR)
    FPOpcode = X86ISD::FOR;
  else if (N->getOpcode() == ISD::XOR)
    FPOpcode = X86ISD::FXOR;

  assert(FPOpcode != ISD::DELETED_NODE &&
         "Unexpected input node for FP logic conversion");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
      ((Subtarget.hasSSE1() && VT == MVT::i32) ||
       (Subtarget.hasSSE2() && VT == MVT::i64))) {
    SDValue N00 = N0.getOperand(0);
    SDValue N10 = N1.getOperand(0);
    EVT N00Type = N00.getValueType();
    EVT N10Type = N10.getValueType();
    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
      return DAG.getBitcast(VT, FPLogic);
    }
  }
  return SDValue();
}
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
  EVT VT0 = Op0.getValueType();
  EVT VT1 = Op1.getValueType();

  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
    return SDValue();

  APInt SplatVal;
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
      !SplatVal.isMask())
    return SDValue();

  if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
    return SDValue();

  unsigned EltBitWidth = VT0.getScalarSizeInBits();
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
    return SDValue();

  SDLoc DL(N);
  unsigned ShiftVal = SplatVal.countTrailingOnes();
  SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
  return DAG.getBitcast(N->getValueType(0), Shift);
}
// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
  if (Ld->isIndexed())
    return SDValue();

  SDValue Base = Ld->getBasePtr();

  if (Base.getOpcode() != ISD::ADD)
    return SDValue();

  SDValue ShiftedIndex = Base.getOperand(0);

  if (ShiftedIndex.getOpcode() != ISD::SHL)
    return SDValue();

  return ShiftedIndex.getOperand(0);
}
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
    switch (VT.getSizeInBits()) {
    default: return false;
    case 64: return Subtarget.is64Bit();
    case 32: return true;
    }
  }
  return false;
}
// This function recognizes cases where the X86 bzhi instruction can replace an
// 'and-load' sequence.
// In case of loading an integer value from an array of constants which is
// defined as follows:
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// then applying a bitwise and on the result with another input.
// It's equivalent to performing bzhi (zero high bits) on the input, with the
// same index of the load.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  // Check if subtarget has BZHI instruction for the node's type
  if (!hasBZHI(Subtarget, VT))
    return SDValue();

  // Try matching the pattern for both operands.
  for (unsigned i = 0; i < 2; i++) {
    SDValue N = Node->getOperand(i);
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

    // Continue if the operand is not a load instruction.
    if (!Ld)
      return SDValue();

    const Value *MemOp = Ld->getMemOperand()->getValue();

    if (!MemOp)
      return SDValue();

    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {

          Constant *Init = GV->getInitializer();
          Type *Ty = Init->getType();
          if (!isa<ConstantDataArray>(Init) ||
              !Ty->getArrayElementType()->isIntegerTy() ||
              Ty->getArrayElementType()->getScalarSizeInBits() !=
                  VT.getSizeInBits() ||
              Ty->getArrayNumElements() >
                  Ty->getArrayElementType()->getScalarSizeInBits())
            continue;

          // Check if the array's constant elements are suitable to our case.
          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
          bool ConstantsMatch = true;
          for (uint64_t j = 0; j < ArrayElementCount; j++) {
            ConstantInt *Elem =
                dyn_cast<ConstantInt>(Init->getAggregateElement(j));
            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
              ConstantsMatch = false;
              break;
            }
          }
          if (!ConstantsMatch)
            continue;

          // Do the transformation (For 32-bit type):
          // -> (and (load arr[idx]), inp)
          // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
          // that will be replaced with one bzhi instruction.
          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);

          // Get the Node which indexes into the array.
          SDValue Index = getIndexFromUnindexedLoad(Ld);
          if (!Index)
            return SDValue();
          Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);

          SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
          Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);

          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
        }
      }
    }
  }
  return SDValue();
}
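
// The equivalence relied on above: for 0 < idx < bitwidth,
// (1 << idx) - 1 == (all-ones >> (bitwidth - idx)), which is exactly the
// srl-of-all-ones node built here and what BZHI computes in one instruction.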
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // If this is SSE1 only convert to FAND to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
  }

  // Use a 32-bit and+zext if upper bits known zero.
  if (VT == MVT::i64 && Subtarget.is64Bit() &&
      !isa<ConstantSDNode>(N->getOperand(1))) {
    APInt HiMask = APInt::getHighBitsSet(64, 32);
    if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
        DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
      SDLoc dl(N);
      SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
      SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
                         DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
    }
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return R;

  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
    return ShiftRight;

  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
    return R;

  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, Subtarget))
      return Res;
  }

  // Attempt to combine a scalar bitmask AND with an extracted shuffle.
  if ((VT.getScalarSizeInBits() % 8) == 0 &&
      N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
    SDValue BitMask = N->getOperand(1);
    SDValue SrcVec = N->getOperand(0).getOperand(0);
    EVT SrcVecVT = SrcVec.getValueType();

    // Check that the constant bitmask masks whole bytes.
    APInt UndefElts;
    SmallVector<APInt, 64> EltBits;
    if (VT == SrcVecVT.getScalarType() &&
        N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
        getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
        llvm::all_of(EltBits, [](APInt M) {
          return M.isNullValue() || M.isAllOnesValue();
        })) {
      unsigned NumElts = SrcVecVT.getVectorNumElements();
      unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
      unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

      // Create a root shuffle mask from the byte mask and the extracted index.
      SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
      for (unsigned i = 0; i != Scale; ++i) {
        if (UndefElts[i])
          continue;
        int VecIdx = Scale * Idx + i;
        ShuffleMask[VecIdx] =
            EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
      }

      if (SDValue Shuffle = combineX86ShufflesRecursively(
              {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
              /*HasVarMask*/ false, DAG, Subtarget))
        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
                           N->getOperand(0).getOperand(1));
    }
  }

  return SDValue();
}
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
  if (N->getOpcode() != ISD::OR)
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
    return false;

  Mask = N1.getOperand(0);
  X = N1.getOperand(1);

  // Check to see if the mask appeared in both the AND and ANDNP.
  if (N0.getOperand(0) == Mask)
    Y = N0.getOperand(1);
  else if (N0.getOperand(1) == Mask)
    Y = N0.getOperand(0);
  else
    return false;

  // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
  // ANDNP combine allows other combines to happen that prevent matching.
  return true;
}
// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
// As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

  EVT VT = N->getValueType(0);
  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
        (VT.is256BitVector() && Subtarget.hasInt256())))
    return SDValue();

  SDValue X, Y, Mask;
  if (!matchLogicBlend(N, X, Y, Mask))
    return SDValue();

  // Validate that X, Y, and Mask are bitcasts, and see through them.
  Mask = peekThroughBitcasts(Mask);
  X = peekThroughBitcasts(X);
  Y = peekThroughBitcasts(Y);

  EVT MaskVT = Mask.getValueType();
  unsigned EltBits = MaskVT.getScalarSizeInBits();

  // TODO: Attempt to handle floating point cases as well?
  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
    return SDValue();

  SDLoc DL(N);

  // Try to match:
  //   (or (and (M, (sub 0, X)), (pandn M, X)))
  // which is a special case of vselect:
  //   (vselect M, (sub 0, X), X)
  // Per:
  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
  // We know that, if fNegate is 0 or 1:
  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
  //
  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
  //   ( M      ? -X : X) == ((X ^ M       ) + (M & 1))
  // This lets us transform our vselect to:
  //   (add (xor X, M), (and M, 1))
  // And further to:
  //   (sub (xor X, M), M)
  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
      DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
    auto IsNegV = [](SDNode *N, SDValue V) {
      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };
    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = X;
    else if (IsNegV(X.getNode(), Y))
      V = Y;

    if (V) {
      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
      SDValue SubOp2 = Mask;

      // If the negate was on the false side of the select, then
      // the operands of the SUB need to be swapped. PR 27251.
      // This is because the pattern being matched above is
      // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
      // but if the pattern matched was
      // (vselect M, X, (sub (0, X))), that is really negation of the pattern
      // above, -(vselect M, (sub 0, X), X), and therefore the replacement
      // pattern also needs to be a negation of the replacement pattern above.
      // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
      // sub accomplishes the negation of the replacement pattern.
      if (V == Y)
        std::swap(SubOp1, SubOp2);

      SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
      return DAG.getBitcast(VT, Res);
    }
  }

  // PBLENDVB is only available on SSE 4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();

  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

  X = DAG.getBitcast(BlendVT, X);
  Y = DAG.getBitcast(BlendVT, Y);
  Mask = DAG.getBitcast(BlendVT, Mask);
  Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
  return DAG.getBitcast(VT, Mask);
}
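
// Quick check of the conditional-negate identity: with M all-ones,
// (X ^ M) - M == ~X + 1 == -X; with M all-zeros it is just X.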
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
                                          SelectionDAG &DAG) {
  SDValue Cmp = Op.getOperand(1);
  EVT VT = Cmp.getOperand(0).getValueType();
  unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
  // The result of the shift is true or false, and on X86, the 32-bit
  // encoding of shr and lzcnt is more desirable.
  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
                            DAG.getConstant(Log2b, dl, MVT::i8));
  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
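
// e.g. for i32: x == 0 gives ctlz(x) == 32 and 32 >> 5 == 1, while any
// nonzero x gives ctlz(x) <= 31, so the srl produces 0.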
// Try to transform:
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
// Will also attempt to match more generic cases, e.g.:
//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
    return SDValue();

  auto isORCandidate = [](SDValue N) {
    return (N->getOpcode() == ISD::OR && N->hasOneUse());
  };

  // Check that the zero extend extends to 32 bits or more. The code generated
  // by srl(ctlz) for 16-bit or less variants of the pattern would require
  // extra instructions to clear the upper bits.
  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
      !isORCandidate(N->getOperand(0)))
    return SDValue();

  // Check the node matches: setcc(eq, cmp 0)
  auto isSetCCCandidate = [](SDValue N) {
    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
           N->getOperand(1).getOpcode() == X86ISD::CMP &&
           isNullConstant(N->getOperand(1).getOperand(1)) &&
           N->getOperand(1).getValueType().bitsGE(MVT::i32);
  };

  SDNode *OR = N->getOperand(0).getNode();
  SDValue LHS = OR->getOperand(0);
  SDValue RHS = OR->getOperand(1);

  // Save nodes matching or(or, setcc(eq, cmp 0)).
  SmallVector<SDNode *, 2> ORNodes;
  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
    ORNodes.push_back(OR);
    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
  }

  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
      !isORCandidate(SDValue(OR, 0)))
    return SDValue();

  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
  // to
  //   or(srl(ctlz),srl(ctlz)).
  // The dag combiner can then fold it into:
  //   srl(or(ctlz, ctlz)).
  EVT VT = OR->getValueType(0);
  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
  SDValue Ret, NewRHS;
  if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

  if (!Ret)
    return SDValue();

  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
  while (ORNodes.size() > 0) {
    OR = ORNodes.pop_back_val();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
    if (RHS->getOpcode() == ISD::OR)
      std::swap(LHS, RHS);
    EVT VT = OR->getValueType(0);
    SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
  }

  if (Ret)
    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

  return Ret;
}
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI,
                         const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // If this is SSE1 only convert to FOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(MVT::v4i32,
                          DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
                                      DAG.getBitcast(MVT::v4f32, N0),
                                      DAG.getBitcast(MVT::v4f32, N1)));
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
    return R;

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent
  // series of shifts/or that would otherwise be generated.
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
  // have higher latencies and we are not optimizing for size.
  if (!OptForSize && Subtarget.isSHLDSlow())
    return SDValue();

  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB ||
      ShAmt0.getOpcode() == ISD::XOR) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
  // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  } else if (ShAmt1.getOpcode() == ISD::XOR) {
    SDValue Mask = ShAmt1.getOperand(1);
    if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
      unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
      SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
      if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op0 = ShAmt1Op0.getOperand(0);
      if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
        if (Op1.getOpcode() == InnerShift &&
            isa<ConstantSDNode>(Op1.getOperand(1)) &&
            Op1.getConstantOperandVal(1) == 1) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
        // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
        if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
            Op1.getOperand(0) == Op1.getOperand(1)) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
      }
    }
  }

  return SDValue();
}
/// Try to turn tests against the signbit in the form of:
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
  // This is only worth doing if the output type is i8 or i1.
  EVT ResultType = N->getValueType(0);
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We should be performing an xor against a truncated shift.
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
    return SDValue();

  // Make sure we are performing an xor against one.
  if (!isOneConstant(N1))
    return SDValue();

  // SetCC on x86 zero extends so only act on this if it's a logical shift.
  SDValue Shift = N0.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
    return SDValue();

  // Make sure we are truncating from one of i16, i32 or i64.
  EVT ShiftTy = Shift.getValueType();
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
    return SDValue();

  // Make sure the shift amount extracts the sign bit.
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
    return SDValue();

  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical looking
  // comparison, and using SETGT matches up with what TranslateX86CC expects.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
  EVT ShiftOpTy = ShiftOp.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
                                               *DAG.getContext(), ResultType);
  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
                              DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
  if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}
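
// e.g. for i32 x: (xor (trunc (srl x, 31)), 1) is 1 exactly when the sign
// bit is clear, which is the same predicate as setgt(x, -1).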
/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isSimple())
    return SDValue();

  switch (VT.getSimpleVT().SimpleTy) {
  default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
  case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  }

  // There must be a shift right algebraic before the xor, and the xor must be a
  // 'not' operation.
  SDValue Shift = N->getOperand(0);
  SDValue Ones = N->getOperand(1);
  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
      !ISD::isBuildVectorAllOnes(Ones.getNode()))
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
  if (!ShiftBV)
    return SDValue();

  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
  auto *ShiftAmt = ShiftBV->getConstantSplatNode();
  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
    return SDValue();

  // Create a greater-than comparison against -1. We don't use the more obvious
  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
/// Check if truncation with saturation from type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
                                        const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX512())
    return false;

  // FIXME: Scalar type may be supported if we move it to vector register.
  if (!SrcVT.isVector())
    return false;

  EVT SrcElVT = SrcVT.getScalarType();
  EVT DstElVT = DstVT.getScalarType();
  if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
    return false;
  if (SrcVT.is512BitVector() || Subtarget.hasVLX())
    return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
  return false;
}
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
///   Return the source value x to be truncated or SDValue() if the pattern was
///   not matched.
///
/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
///   where C1 >= 0 and C2 is unsigned max of destination type.
///
///    (truncate (smax (smin (x, C2), C1)) to dest_type)
///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
///
///   These two patterns are equivalent to:
///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
///   So return the smax(x, C1) value to be truncated or SDValue() if the
///   pattern was not matched.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                 const SDLoc &DL) {
  EVT InVT = In.getValueType();

  // Saturation with truncation. We truncate from InVT to VT.
  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
         "Unexpected types for truncate operation");

  // Match min/max and return the limit value as a parameter.
  auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
    if (V.getOpcode() == Opcode &&
        ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
      return V.getOperand(0);
    return SDValue();
  };

  APInt C1, C2;
  if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
    // to the element size of the destination type.
    if (C2.isMask(VT.getScalarSizeInBits()))
      return UMin;

  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
    if (MatchMinMax(SMin, ISD::SMAX, C1))
      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
        return SMin;

  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
          C1.ule(C2))
        return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));

  return SDValue();
}
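
// For example, when truncating v8i32 to v8i16:
//   (truncate (umin X, 65535) to v8i16)           --> matches, returns X
//   (truncate (smin (smax X, 0), 65535) to v8i16) --> matches, returns
//                                                     (smax X, 0)
// either of which can then be lowered with an unsigned saturating truncate
// such as VPMOVUSDW on AVX512.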
/// Detect patterns of truncation with signed saturation:
/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
///                  signed_max_of_dest_type)) to dest_type)
/// or:
/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
///                  signed_min_of_dest_type)) to dest_type).
/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
  unsigned NumDstBits = VT.getScalarSizeInBits();
  unsigned NumSrcBits = In.getScalarValueSizeInBits();
  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");

  auto MatchMinMax = [](SDValue V, unsigned Opcode,
                        const APInt &Limit) -> SDValue {
    APInt C;
    if (V.getOpcode() == Opcode &&
        ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
      return V.getOperand(0);
    return SDValue();
  };

  APInt SignedMax, SignedMin;
  if (MatchPackUS) {
    SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
    SignedMin = APInt(NumSrcBits, 0);
  } else {
    SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
    SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
  }

  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
    if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
      return SMax;

  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
      return SMin;

  return SDValue();
}
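
// For example, when truncating v8i32 to v8i16:
//   (truncate (smin (smax X, -32768), 32767) to v8i16) --> returns X
// With MatchPackUS the accepted clamp range is [0, 65535] instead, which is
// exactly the range an unsigned PACK produces.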
/// Detect a pattern of truncation with signed saturation.
/// The types should allow using the VPMOVSS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
                                       const X86Subtarget &Subtarget,
                                       const TargetLowering &TLI) {
  if (!TLI.isTypeLegal(In.getValueType()))
    return SDValue();
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();
  return detectSSatPattern(In, VT);
}
/// Detect a pattern of truncation with unsigned saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow using the VPMOVUS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                       const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       const TargetLowering &TLI) {
  if (!TLI.isTypeLegal(In.getValueType()))
    return SDValue();
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();
  return detectUSatPattern(In, VT, DAG, DL);
}
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT SVT = VT.getScalarType();
  EVT InVT = In.getValueType();
  EVT InSVT = InVT.getScalarType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
      isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
    if (auto SSatVal = detectSSatPattern(In, VT))
      return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
    if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
  }
  if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
      (SVT == MVT::i8 || SVT == MVT::i16) &&
      (InSVT == MVT::i16 || InSVT == MVT::i32)) {
    if (auto USatVal = detectSSatPattern(In, VT, true)) {
      // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
      if (SVT == MVT::i8 && InSVT == MVT::i32) {
        EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                     VT.getVectorNumElements());
        SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
                                             DAG, Subtarget);
        if (Mid)
          return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
                                        Subtarget);
      } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
        return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
                                      Subtarget);
    }
    if (auto SSatVal = detectSSatPattern(In, VT))
      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
                                    Subtarget);
  }
  return SDValue();
}
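
// For example, a v16i32 -> v16i8 unsigned-saturating truncate is built as
// PACKUSWB(PACKSSDW, PACKSSDW): the signed dword->word packs keep values in
// the [0, 255] clamp range intact, and the final unsigned word->byte pack
// performs the actual saturation to i8.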
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces it with the more efficient
/// X86ISD::AVG instruction.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
  if (!VT.isVector())
    return SDValue();
  EVT InVT = In.getValueType();
  unsigned NumElems = VT.getVectorNumElements();

  EVT ScalarVT = VT.getVectorElementType();
  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
        isPowerOf2_32(NumElems)))
    return SDValue();

  // InScalarVT is the intermediate type in the AVG pattern and it should be
  // greater than the original input type (i8/i16).
  EVT InScalarVT = InVT.getVectorElementType();
  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
    return SDValue();

  if (!Subtarget.hasSSE2())
    return SDValue();

  // Detect the following pattern:
  //
  //   %1 = zext <N x i8> %a to <N x i32>
  //   %2 = zext <N x i8> %b to <N x i32>
  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
  //   %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
  //   %6 = trunc <N x i32> %5 to <N x i8>
  //
  // In AVX512, the last instruction can also be a trunc store.
  if (In.getOpcode() != ISD::SRL)
    return SDValue();

  // A lambda checking whether the given SDValue is a constant vector with
  // each element in the range [Min, Max].
  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV || !BV->isConstant())
      return false;
    for (SDValue Op : V->ops()) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
      if (!C)
        return false;
      const APInt &Val = C->getAPIntValue();
      if (Val.ult(Min) || Val.ugt(Max))
        return false;
    }
    return true;
  };

  // Check if each element of the vector is right-shifted by one.
  auto LHS = In.getOperand(0);
  auto RHS = In.getOperand(1);
  if (!IsConstVectorInRange(RHS, 1, 1))
    return SDValue();
  if (LHS.getOpcode() != ISD::ADD)
    return SDValue();

  // Detect a pattern of a + b + 1 where the order doesn't matter.
  SDValue Operands[3];
  Operands[0] = LHS.getOperand(0);
  Operands[1] = LHS.getOperand(1);

  auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                       ArrayRef<SDValue> Ops) {
    return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
  };

  // Take care of the case when one of the operands is a constant vector whose
  // element is in the range [1, 256] (or [1, 65536] for i16).
  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
      Operands[0].getOperand(0).getValueType() == VT) {
    // The pattern is detected. Subtract one from the constant vector, then
    // demote it and emit the X86ISD::AVG instruction.
    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
    return SplitOpsAndApply(DAG, Subtarget, DL, VT,
                            { Operands[0].getOperand(0), Operands[1] },
                            AVGBuilder);
  }

  if (Operands[0].getOpcode() == ISD::ADD)
    std::swap(Operands[0], Operands[1]);
  else if (Operands[1].getOpcode() != ISD::ADD)
    return SDValue();
  Operands[2] = Operands[1].getOperand(0);
  Operands[1] = Operands[1].getOperand(1);

  // Now we have three operands of two additions. Check that one of them is a
  // constant vector with ones, and the other two are promoted from i8/i16.
  for (int i = 0; i < 3; ++i) {
    if (!IsConstVectorInRange(Operands[i], 1, 1))
      continue;
    std::swap(Operands[i], Operands[2]);

    // Check if Operands[0] and Operands[1] are results of type promotion.
    for (int j = 0; j < 2; ++j)
      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
          Operands[j].getOperand(0).getValueType() != VT)
        return SDValue();

    // The pattern is detected, emit X86ISD::AVG instruction(s).
    return SplitOpsAndApply(DAG, Subtarget, DL, VT,
                            { Operands[0].getOperand(0),
                              Operands[1].getOperand(0) }, AVGBuilder);
  }

  return SDValue();
}
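
// For example, the scalar loop
//   c[i] = (a[i] + b[i] + 1) >> 1;  // a, b, c unsigned char
// vectorizes into exactly the zext/add/add/lshr/trunc chain shown above,
// which this combine collapses into X86ISD::AVG nodes (PAVGB).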
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations. Also split non-temporal aligned loads on
  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
  unsigned AddressSpace = Ld->getAddressSpace();
  unsigned Alignment = Ld->getAlignment();
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                               AddressSpace, Alignment, &Fast) && !Fast))) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Ptr = Ld->getBasePtr();

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Alignment, Ld->getMemOperand()->getFlags());

    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
    SDValue Load2 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                    Ld->getPointerInfo().getWithOffset(16),
                    MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1),
                             Load2.getValue(1));

    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}
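
// For example, on a target with slow unaligned 32-byte accesses (e.g. Sandy
// Bridge), a v8f32 load with only 16-byte alignment becomes two v4f32 loads
// from base and base+16, joined by a TokenFactor and recombined with
// CONCAT_VECTORS.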
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
  // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; e.g., the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
  auto *BV = dyn_cast<BuildVectorSDNode>(V);
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
    return -1;

  int TrueIndex = -1;
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    const SDValue &Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
    if (!ConstNode)
      return -1;
    if (ConstNode->getAPIntValue().isAllOnesValue()) {
      // If we already found a one, this is too many.
      if (TrueIndex >= 0)
        return -1;
      TrueIndex = i;
    }
  }
  return TrueIndex;
}
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
                                         SelectionDAG &DAG, SDValue &Addr,
                                         SDValue &Index, unsigned &Alignment) {
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Addr = MaskedOp->getBasePtr();
  if (TrueMaskElt != 0) {
    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
  return true;
}
/// If exactly one element of the mask is set for a non-extending masked load,
/// it can be reduced to a scalar load and a vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
                  Alignment, ML->getMemOperand()->getFlags());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
                               Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (LoadFirstElt && LoadLastElt) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).
  //
  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  if (ML->getSrc0().isUndef())
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                    ML->getMask(), DAG.getUNDEF(VT),
                                    ML->getMemoryVT(), ML->getMemOperand(),
                                    ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());

  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
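
// For example, a v4f32 masked load with the constant mask <1,1,0,0> becomes
// a masked load with an undef pass-through followed by a vselect, letting
// instruction selection use an immediate-mask BLENDPS instead of the
// variable-mask BLENDVPS.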
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();

  // Resolve extending loads.
  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getScalarSizeInBits();
  unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   LdVT.getScalarType(), NumElems*SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }

  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
    Ops[0] = Mask;
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked
/// store, it can be reduced to a vector extract and a scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
                      Alignment, MS->getMemOperand()->getFlags());
}
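
// For example, a v4i32 masked store whose mask is the constant <0,0,1,0>
// becomes an EXTRACT_VECTOR_ELT of lane 2 and a plain i32 store to base+8.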
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  if (Mst->isCompressingStore())
    return SDValue();

  if (!Mst->isTruncatingStore()) {
    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
      return ScalarStore;

    // If the mask is checking (0 > X), we're creating a vector with all-zeros
    // or all-ones elements based on the sign bits of X. AVX1 masked store only
    // cares about the sign bit of each mask element, so eliminate the compare:
    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
    // Note that by waiting to match an x86-specific PCMPGT node, we're
    // eliminating potentially more complex matching of a setcc node which has
    // a full range of predicates.
    SDValue Mask = Mst->getMask();
    if (Mask.getOpcode() == X86ISD::PCMPGT &&
        ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
      assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
             "Unexpected type for PCMPGT");
      return DAG.getMaskedStore(
          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
          Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
    }

    // TODO: AVX512 targets should also be able to simplify something like the
    // pattern above, but that pattern will be different. It will either need to
    // match setcc more generally or match PCMPGTM later (in tablegen?).

    return SDValue();
  }

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
  // are designated for truncate store.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   StVT.getScalarType(), NumElems*SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
      ShuffleVec[i] = NumElems*SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
    Ops[0] = Mask;
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}
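
// For example, in the non-truncating case:
//   masked_store val, ptr, (pcmpgt 0, X)
// stores exactly the lanes where X is negative; since the AVX masked-store
// instructions (e.g. VMASKMOVPS) test only the mask's sign bits, X itself
// can serve as the mask directly.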
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
  // This will avoid a copy to k-register.
  if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
      StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      StoredVal.getOperand(0).getValueType() == MVT::i8) {
    return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
                        St->getBasePtr(), St->getPointerInfo(),
                        St->getAlignment(), St->getMemOperand()->getFlags());
  }

  // Widen v2i1/v4i1 stores to v8i1.
  if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
      Subtarget.hasAVX512()) {
    unsigned NumConcats = 8 / VT.getVectorNumElements();
    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
    Ops[0] = StoredVal;
    StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  // Turn vXi1 stores of constants into a scalar store.
  if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If it's a v64i1 store without 64-bit support, we need two stores.
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(0, 32));
      Lo = combinevXi1ConstantToInteger(Lo, DAG);
      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(32, 32));
      Hi = combinevXi1ConstantToInteger(Hi, DAG);

      unsigned Alignment = St->getAlignment();

      SDValue Ptr0 = St->getBasePtr();
      SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);

      SDValue Ch0 =
          DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
                       Alignment, St->getMemOperand()->getFlags());
      SDValue Ch1 =
          DAG.getStore(St->getChain(), dl, Hi, Ptr1,
                       St->getPointerInfo().getWithOffset(4),
                       MinAlign(Alignment, 4U),
                       St->getMemOperand()->getFlags());
      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
    }

    StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }
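
  // For example, storing a constant v16i1 mask on AVX512 becomes a single
  // 16-bit scalar store of the equivalent bit pattern (via
  // combinevXi1ConstantToInteger) instead of materializing the constant in a
  // k-register first.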

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
  unsigned Alignment = St->getAlignment();
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

    SDValue Ch0 =
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
    SDValue Ch1 =
        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                     St->getPointerInfo().getWithOffset(16),
                     MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    // instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (SDValue Val =
            detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
                                    TLI))
      return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);
    if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
                                              DAG, dl, Subtarget, TLI))
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);

    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
    // are designated for truncate store.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

    // From, To sizes and ElemCount must be pow of two.
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                     StVT.getScalarType(), NumElems*SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to memory.

    // Find the largest store unit.
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units.
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
                       St->getAlignment(), St->getMemOperand()->getFlags());
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }
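
  // For example, a v8i16 -> v8i8 truncating store on SSE2: the bitcast v16i8
  // value is shuffled so lanes <0,2,4,6,8,10,12,14> land in the low half of
  // the register, and that low half is then written with a single 64-bit
  // store.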

  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.
  //
  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
    SmallVector<SDValue, 8> Ops;

    if (!ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getMemOperand());

      // Make sure new load is placed in same chain order.
      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                          St->getMemOperand());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());
    // Make sure new loads are placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

    SDValue LoSt =
        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                MinAlign(St->getAlignment(), 4),
                                St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}
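
// For example, in 32-bit mode an i64 load feeding an i64 store is rewritten
// as a single f64 (MOVSD) load/store pair when SSE2 is available, instead of
// two 32-bit integer load/store pairs, provided the load has no other uses.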
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle, then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (!LHS.isUndef())
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!RHS.getOperand(0).isUndef())
      C = RHS.getOperand(0);
    if (!RHS.getOperand(1).isUndef())
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (!RHS.isUndef())
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    ShuffleVectorSDNode::commuteMask(RMask);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}
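
// For example, for v8f32 the horizontal ops work on 128-bit lanes
// independently, so the accepted masks are lane-local:
//   LHS = VECTOR_SHUFFLE A, B, <0, 2, 8, 10, 4, 6, 12, 14>
//   RHS = VECTOR_SHUFFLE A, B, <1, 3, 9, 11, 5, 7, 13, 15>
// giving <a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7> for FADD,
// which is exactly what VHADDPS produces.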
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  bool IsFadd = N->getOpcode() == ISD::FADD;
  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, IsFadd)) {
    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
  return SDValue();
}
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          const SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
  SDValue Src = N->getOperand(0);
  unsigned Opcode = Src.getOpcode();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

    // Repeated operand, so we are only trading one output truncation for
    // one input truncation.
    if (Op0 == Op1)
      return true;

    // See if either operand has been extended from a smaller/equal size to
    // the truncation size, allowing a truncation to combine with the extend.
    unsigned Opcode0 = Op0.getOpcode();
    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
         Opcode0 == ISD::ZERO_EXTEND) &&
        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    unsigned Opcode1 = Op1.getOpcode();
    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
         Opcode1 == ISD::ZERO_EXTEND) &&
        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    // See if either operand is a single use constant which can be constant
    // folded.
    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
  };

  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
  };

  // Don't combine if the operation has other uses.
  if (!N->isOnlyUserOf(Src.getNode()))
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())
    return SDValue();

  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
  switch (Opcode) {
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }

  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
        !TLI.isOperationLegal(Opcode, SrcVT))
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
    LLVM_FALLTHROUGH;
  case ISD::ADD: {
    // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  }

  return SDValue();
}
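
// For example, (trunc (and (zext X), (zext Y)) to v8i16), where X and Y are
// v8i16, is rewritten as (and X, Y): both inserted truncations fold into the
// existing extends, so the narrowing is free.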
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
                                                 const X86Subtarget &Subtarget,
                                                 SelectionDAG &DAG) {
  SDValue In = N->getOperand(0);
  EVT InVT = In.getValueType();
  EVT InSVT = InVT.getVectorElementType();
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();

  // Split a long vector into vectors of legal type and mask to unset all bits
  // that won't appear in the result to prevent saturation.
  // TODO - we should be doing this at the maximum legal size but this is
  // causing regressions where we're concatenating back to max width just to
  // perform the AND and then extracting back again...
  unsigned NumSubRegs = InVT.getSizeInBits() / 128;
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
  SmallVector<SDValue, 8> SubVecs(NumSubRegs);

  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);

  for (unsigned i = 0; i < NumSubRegs; i++) {
    SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                              DAG.getIntPtrConstant(i * NumSubRegElts, DL));
    SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
  }
  In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);

  return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
}
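
// For example, for v8i32 -> v8i16 on SSE4.1: each 128-bit subvector is ANDed
// with a splat of 0xFFFF, so PACKUSDW sees values in [0, 65535] and never
// saturates; the two halves then pack into a single v8i16 result.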
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
                                                 const X86Subtarget &Subtarget,
                                                 SelectionDAG &DAG) {
  SDValue In = N->getOperand(0);
  EVT InVT = In.getValueType();
  EVT OutVT = N->getValueType(0);
  In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
                   DAG.getValueType(OutVT));
  return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);
  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
  if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);

  return SDValue();
}
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
/// X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Check we have a truncation suited for PACKSS/PACKUS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Use PACKUS if the input has zero-bits that extend all the way to the
  // packed/truncated value. e.g. masks, zext_in_reg, etc.
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

  // Use PACKSS if the input has sign-bits that extend all the way to the
  // packed/truncated value. e.g. comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

  return SDValue();
}
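
// For example, truncating an all-ones/all-zeros v8i32 comparison result to
// v8i16: ComputeNumSignBits reports 32 sign bits, more than 32 - 16, so the
// truncation is emitted as a single PACKSSDW of the two 128-bit halves.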
// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
                            SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // First instruction should be a right shift of a multiply.
  if (Src.getOpcode() != ISD::SRL ||
      Src.getOperand(0).getOpcode() != ISD::MUL)
    return SDValue();

  if (!Subtarget.hasSSE2())
    return SDValue();

  // Only handle vXi16 types that are at least 128-bits.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
      VT.getVectorNumElements() < 8)
    return SDValue();

  // Input type should be vXi32.
  EVT InVT = Src.getValueType();
  if (InVT.getVectorElementType() != MVT::i32)
    return SDValue();

  // Need a shift by 16.
  APInt ShiftAmt;
  if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
      ShiftAmt != 16)
    return SDValue();

  SDValue LHS = Src.getOperand(0).getOperand(0);
  SDValue RHS = Src.getOperand(0).getOperand(1);

  unsigned ExtOpc = LHS.getOpcode();
  if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
      RHS.getOpcode() != ExtOpc)
    return SDValue();

  // Peek through the extends.
  LHS = LHS.getOperand(0);
  RHS = RHS.getOperand(0);

  // Ensure the input types match.
  if (LHS.getValueType() != VT || RHS.getValueType() != VT)
    return SDValue();

  unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
  return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
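
// For example:
//   (trunc (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16) to v8i16)
// becomes (mulhu X, Y), which selects to a single PMULHUW.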
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect the AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to combine truncation with signed/unsigned saturation.
  if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // Try to combine PMULHUW/PMULHW for vXi16.
  if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
    return V;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 and x86mmx.
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}
36778 /// Returns the negated value if the node \p N flips sign of FP value.
36780 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
36781 /// AVX512F does not have FXOR, so FNEG is lowered as
36782 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
36783 /// In this case we go through all bitcasts.
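/// Illustrative example (assuming a v4f32 negation; not verbatim from the
/// source): the lowered form may appear as
///   (v4f32 (bitcast (xor (v4i32 (bitcast x)),
///                        (v4i32 (bitcast <0x80000000 splat>)))))
/// so every operand below is inspected through its bitcasts.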
36784 static SDValue isFNEG(SDNode *N) {
36785 if (N->getOpcode() == ISD::FNEG)
36786 return N->getOperand(0);
36788 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
36789 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
36790 return SDValue();
36792 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
36793 if (!Op1.getValueType().isFloatingPoint())
36794 return SDValue();
36796 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
36798 unsigned EltBits = Op1.getScalarValueSizeInBits();
36799 auto isSignMask = [&](const ConstantFP *C) {
36800 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
36801 };
36803 // There is more than one way to represent the same constant on
36804 // the different X86 targets. The type of the node may also depend on size.
36805 // - load scalar value and broadcast
36806 // - BUILD_VECTOR node
36807 // - load from a constant pool.
36808 // We check all variants here.
36809 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
36810 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
36811 if (isSignMask(cast<ConstantFP>(C)))
36812 return Op0;
36814 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
36815 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
36816 if (isSignMask(CN->getConstantFPValue()))
36817 return Op0;
36819 } else if (auto *C = getTargetConstantFromNode(Op1)) {
36820 if (C->getType()->isVectorTy()) {
36821 if (auto *SplatV = C->getSplatValue())
36822 if (isSignMask(cast<ConstantFP>(SplatV)))
36823 return Op0;
36824 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
36825 if (isSignMask(FPConst))
36826 return Op0;
36827 }
36829 return SDValue();
36830 }
36831 /// Do target-specific dag combines on floating point negations.
36832 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
36833 const X86Subtarget &Subtarget) {
36834 EVT OrigVT = N->getValueType(0);
36835 SDValue Arg = isFNEG(N);
36836 assert(Arg.getNode() && "N is expected to be an FNEG node");
36838 EVT VT = Arg.getValueType();
36839 EVT SVT = VT.getScalarType();
36840 SDLoc DL(N);
36842 // Let legalize expand this if it isn't a legal type yet.
36843 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
36844 return SDValue();
36846 // If we're negating a FMUL node on a target with FMA, then we can avoid the
36847 // use of a constant by performing (-0 - A*B) instead.
36848 // FIXME: Check rounding control flags as well once it becomes available.
36849 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
36850 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
36851 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
36852 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
36853 Arg.getOperand(1), Zero);
36854 return DAG.getBitcast(OrigVT, NewNode);
36855 }
36857 // If we're negating an FMA node, then we can adjust the
36858 // instruction to include the extra negation.
36859 unsigned NewOpcode = 0;
36860 if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
36861 switch (Arg.getOpcode()) {
36862 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
36863 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
36864 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
36865 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
36866 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
36867 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
36868 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
36869 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
36870 // We can't handle scalar intrinsic node here because it would only
36871 // invert one element and not the whole vector. But we could try to handle
36872 // a negation of the lower element only.
36873 }
36874 }
36875 if (NewOpcode)
36876 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
36877 Arg.getNode()->ops()));
36879 return SDValue();
36880 }
36882 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
36883 const X86Subtarget &Subtarget) {
36884 MVT VT = N->getSimpleValueType(0);
36885 // If we have integer vector types available, use the integer opcodes.
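// Illustrative example (assumed types, not from a test): a v4f32 FAND becomes
//   (v4f32 (bitcast (and (v2i64 (bitcast a)), (v2i64 (bitcast b)))))
// so the integer AND/OR/XOR/ANDNP forms can be used instead.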
36886 if (VT.isVector() && Subtarget.hasSSE2()) {
36887 SDLoc dl(N);
36889 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
36891 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
36892 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
36893 unsigned IntOpcode;
36894 switch (N->getOpcode()) {
36895 default: llvm_unreachable("Unexpected FP logic op");
36896 case X86ISD::FOR: IntOpcode = ISD::OR; break;
36897 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
36898 case X86ISD::FAND: IntOpcode = ISD::AND; break;
36899 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
36900 }
36901 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
36902 return DAG.getBitcast(VT, IntOp);
36903 }
36905 return SDValue();
36906 }
36908 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
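/// Illustrative example (hypothetical condition codes): xor-ing a SETCC with
/// 1 flips its condition, e.g.
///   (xor (setcc COND_E, x), 1) --> (setcc COND_NE, x)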
36909 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
36910 if (N->getOpcode() != ISD::XOR)
36911 return SDValue();
36913 SDValue LHS = N->getOperand(0);
36914 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
36915 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
36916 return SDValue();
36918 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
36919 X86::CondCode(LHS->getConstantOperandVal(0)));
36920 SDLoc DL(N);
36921 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
36922 }
36924 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
36925 TargetLowering::DAGCombinerInfo &DCI,
36926 const X86Subtarget &Subtarget) {
36927 // If this is SSE1 only convert to FXOR to avoid scalarization.
36928 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
36929 N->getValueType(0) == MVT::v4i32) {
36930 return DAG.getBitcast(
36931 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
36932 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
36933 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
36934 }
36936 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
36937 return Cmp;
36939 if (DCI.isBeforeLegalizeOps())
36940 return SDValue();
36942 if (SDValue SetCC = foldXor1SetCC(N, DAG))
36943 return SetCC;
36945 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
36946 return RV;
36948 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
36949 return FPLogic;
36951 if (isFNEG(N))
36952 return combineFneg(N, DAG, Subtarget);
36953 return SDValue();
36954 }
36956 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
36957 TargetLowering::DAGCombinerInfo &DCI,
36958 const X86Subtarget &Subtarget) {
36959 SDValue Op0 = N->getOperand(0);
36960 SDValue Op1 = N->getOperand(1);
36961 EVT VT = N->getValueType(0);
36962 unsigned NumBits = VT.getSizeInBits();
36964 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36965 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36966 !DCI.isBeforeLegalizeOps());
36968 // TODO - Constant Folding.
36969 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36970 // Reduce Cst1 to the bottom 16-bits.
36971 // NOTE: SimplifyDemandedBits won't do this for constants.
36972 const APInt &Val1 = Cst1->getAPIntValue();
36973 APInt MaskedVal1 = Val1 & 0xFFFF;
36974 if (MaskedVal1 != Val1)
36975 return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
36976 DAG.getConstant(MaskedVal1, SDLoc(N), VT));
36977 }
36979 // Only bottom 16-bits of the control bits are required.
36980 KnownBits Known;
36981 APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
36982 if (TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) {
36983 DCI.CommitTargetLoweringOpt(TLO);
36984 return SDValue(N, 0);
36985 }
36987 return SDValue();
36988 }
36990 static bool isNullFPScalarOrVectorConst(SDValue V) {
36991 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
36992 }
36994 /// If a value is a scalar FP zero or a vector FP zero (potentially including
36995 /// undefined elements), return a zero constant that may be used to fold away
36996 /// that value. In the case of a vector, the returned constant will not contain
36997 /// undefined elements even if the input parameter does. This makes it suitable
36998 /// to be used as a replacement operand with operations (eg, bitwise-and) where
36999 /// an undef should not propagate.
37000 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
37001 const X86Subtarget &Subtarget) {
37002 if (!isNullFPScalarOrVectorConst(V))
37003 return SDValue();
37005 if (V.getValueType().isVector())
37006 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
37008 return V;
37009 }
37011 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
37012 const X86Subtarget &Subtarget) {
37013 SDValue N0 = N->getOperand(0);
37014 SDValue N1 = N->getOperand(1);
37015 EVT VT = N->getValueType(0);
37016 SDLoc DL(N);
37018 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
37019 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
37020 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
37021 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
37022 return SDValue();
37024 auto isAllOnesConstantFP = [](SDValue V) {
37025 if (V.getSimpleValueType().isVector())
37026 return ISD::isBuildVectorAllOnes(V.getNode());
37027 auto *C = dyn_cast<ConstantFPSDNode>(V);
37028 return C && C->getConstantFPValue()->isAllOnesValue();
37029 };
37031 // fand (fxor X, -1), Y --> fandn X, Y
37032 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
37033 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
37035 // fand X, (fxor Y, -1) --> fandn Y, X
37036 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
37037 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
37039 return SDValue();
37040 }
37042 /// Do target-specific dag combines on X86ISD::FAND nodes.
37043 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
37044 const X86Subtarget &Subtarget) {
37045 // FAND(0.0, x) -> 0.0
37046 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
37047 return V;
37049 // FAND(x, 0.0) -> 0.0
37050 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
37051 return V;
37053 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
37054 return V;
37056 return lowerX86FPLogicOp(N, DAG, Subtarget);
37057 }
37059 /// Do target-specific dag combines on X86ISD::FANDN nodes.
37060 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
37061 const X86Subtarget &Subtarget) {
37062 // FANDN(0.0, x) -> x
37063 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
37064 return N->getOperand(1);
37066 // FANDN(x, 0.0) -> 0.0
37067 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
37068 return V;
37070 return lowerX86FPLogicOp(N, DAG, Subtarget);
37071 }
37073 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
37074 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
37075 const X86Subtarget &Subtarget) {
37076 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
37078 // F[X]OR(0.0, x) -> x
37079 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
37080 return N->getOperand(1);
37082 // F[X]OR(x, 0.0) -> x
37083 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
37084 return N->getOperand(0);
37086 if (isFNEG(N))
37087 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
37088 return NewVal;
37090 return lowerX86FPLogicOp(N, DAG, Subtarget);
37091 }
37093 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
37094 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
37095 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
37097 // Only perform optimizations if UnsafeMath is used.
37098 if (!DAG.getTarget().Options.UnsafeFPMath)
37099 return SDValue();
37101 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
37102 // into FMINC and FMAXC, which are Commutative operations.
37103 unsigned NewOp = 0;
37104 switch (N->getOpcode()) {
37105 default: llvm_unreachable("unknown opcode");
37106 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
37107 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
37108 }
37110 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
37111 N->getOperand(0), N->getOperand(1));
37112 }
37114 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
37115 const X86Subtarget &Subtarget) {
37116 if (Subtarget.useSoftFloat())
37117 return SDValue();
37119 // TODO: If an operand is already known to be a NaN or not a NaN, this
37120 // should be an optional swap and FMAX/FMIN.
37122 EVT VT = N->getValueType(0);
37123 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
37124 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
37125 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
37126 return SDValue();
37128 SDValue Op0 = N->getOperand(0);
37129 SDValue Op1 = N->getOperand(1);
37130 SDLoc DL(N);
37131 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
37133 // If we don't have to respect NaN inputs, this is a direct translation to x86
37134 // min/max instructions.
37135 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
37136 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
37138 // If we have to respect NaN inputs, this takes at least 3 instructions.
37139 // Favor a library call when operating on a scalar and minimizing code size.
37140 if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
37141 return SDValue();
37143 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
37144 DAG.getDataLayout(), *DAG.getContext(), VT);
37146 // There are 4 possibilities involving NaN inputs, and these are the required
37147 // outputs:
37148 //                   Op1
37149 //               Num     NaN
37150 //            ----------------
37151 //       Num  |  Max  |  Op0 |
37152 // Op0        ----------------
37153 //       NaN  |  Op1  |  NaN |
37154 //            ----------------
37155 //
37156 // The SSE FP max/min instructions were not designed for this case, but rather
37157 // to implement:
37158 //   Min = Op1 < Op0 ? Op1 : Op0
37159 //   Max = Op1 > Op0 ? Op1 : Op0
37161 // So they always return Op0 if either input is a NaN. However, we can still
37162 // use those instructions for fmaxnum by selecting away a NaN input.
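// Illustrative walk-through (assumed scalar f32 operands, not from a test):
// for fmaxnum(x, NaN) the sequence below computes
//   MinOrMax = FMAX(NaN, x) = x        // Op0 passes through on NaN
//   IsOp0Nan = setcc(x, x, SETUO) = 0  // x is not a NaN
// and the final select returns MinOrMax, i.e. x, as fmaxnum requires.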
37164 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
37165 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
37166 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
37168 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
37169 // are NaN, the NaN value of Op1 is the result.
37170 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
37171 }
37173 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
37174 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
37175 TargetLowering::DAGCombinerInfo &DCI,
37176 const X86Subtarget &Subtarget) {
37177 // ANDNP(0, x) -> x
37178 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
37179 return N->getOperand(1);
37181 // ANDNP(x, 0) -> 0
37182 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
37183 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
37185 EVT VT = N->getValueType(0);
37187 // Attempt to recursively combine a bitmask ANDNP with shuffles.
37188 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
37189 SDValue Op(N, 0);
37190 if (SDValue Res = combineX86ShufflesRecursively(
37191 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
37192 /*HasVarMask*/ false, DAG, Subtarget))
37193 return Res;
37194 }
37196 return SDValue();
37197 }
37199 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
37200 TargetLowering::DAGCombinerInfo &DCI) {
37201 SDValue N0 = N->getOperand(0);
37202 SDValue N1 = N->getOperand(1);
37204 // BT ignores high bits in the bit index operand.
37205 unsigned BitWidth = N1.getValueSizeInBits();
37206 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
37207 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
37208 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
37210 return SDValue();
37211 }
37213 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
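// Illustrative example (hypothetical constants): with C0/C1 constant,
//   (sext_in_reg (cmov C0, C1, cc, eflags), i16)
// becomes (cmov (sext_in_reg C0, i16), (sext_in_reg C1, i16), cc, eflags),
// folding the extension into the constants.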
37214 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
37215 EVT VT = N->getValueType(0);
37217 SDValue N0 = N->getOperand(0);
37218 SDValue N1 = N->getOperand(1);
37219 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
37221 if (ExtraVT != MVT::i16)
37222 return SDValue();
37224 // Look through single use any_extends.
37225 if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
37226 N0 = N0.getOperand(0);
37228 // See if we have a single use cmov.
37229 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
37230 return SDValue();
37232 SDValue CMovOp0 = N0.getOperand(0);
37233 SDValue CMovOp1 = N0.getOperand(1);
37235 // Make sure both operands are constants.
37236 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
37237 !isa<ConstantSDNode>(CMovOp1.getNode()))
37238 return SDValue();
37240 SDLoc DL(N);
37242 // If we looked through an any_extend above, add one to the constants.
37243 if (N0.getValueType() != VT) {
37244 CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
37245 CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
37246 }
37248 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
37249 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
37251 return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
37252 N0.getOperand(2), N0.getOperand(3));
37253 }
37255 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
37256 const X86Subtarget &Subtarget) {
37257 if (SDValue V = combineSextInRegCmov(N, DAG))
37258 return V;
37260 EVT VT = N->getValueType(0);
37261 SDValue N0 = N->getOperand(0);
37262 SDValue N1 = N->getOperand(1);
37263 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
37264 SDLoc dl(N);
37266 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
37267 // both SSE and AVX2 since there is no sign-extended shift right
37268 // operation on a vector with 64-bit elements.
37269 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
37270 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
37271 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
37272 N0.getOpcode() == ISD::SIGN_EXTEND)) {
37273 SDValue N00 = N0.getOperand(0);
37275 // EXTLOAD has a better solution on AVX2: it may be replaced with an
37276 // X86ISD::VSEXT node.
37277 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
37278 if (!ISD::isNormalLoad(N00.getNode()))
37279 return SDValue();
37281 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
37282 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
37283 N00, N1);
37284 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
37285 }
37286 }
37288 return SDValue();
37289 }
37290 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
37291 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
37292 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
37293 /// opportunities to combine math ops, use an LEA, or use a complex addressing
37294 /// mode. This can eliminate extend, add, and shift instructions.
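// Illustrative example (hypothetical values): when the add is known 'nsw',
//   (i64 sext (i32 add_nsw x, 42)) --> (i64 add (i64 sext x), 42)
// and the resulting add can then fold into an addressing mode or LEA.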
37295 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
37296 const X86Subtarget &Subtarget) {
37297 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
37298 Ext->getOpcode() != ISD::ZERO_EXTEND)
37299 return SDValue();
37301 // TODO: This should be valid for other integer types.
37302 EVT VT = Ext->getValueType(0);
37303 if (VT != MVT::i64)
37304 return SDValue();
37306 SDValue Add = Ext->getOperand(0);
37307 if (Add.getOpcode() != ISD::ADD)
37308 return SDValue();
37310 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
37311 bool NSW = Add->getFlags().hasNoSignedWrap();
37312 bool NUW = Add->getFlags().hasNoUnsignedWrap();
37314 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
37315 // into the 'zext'.
37316 if ((Sext && !NSW) || (!Sext && !NUW))
37317 return SDValue();
37319 // Having a constant operand to the 'add' ensures that we are not increasing
37320 // the instruction count because the constant is extended for free below.
37321 // A constant operand can also become the displacement field of an LEA.
37322 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
37323 if (!AddOp1)
37324 return SDValue();
37326 // Don't make the 'add' bigger if there's no hope of combining it with some
37327 // other 'add' or 'shl' instruction.
37328 // TODO: It may be profitable to generate simpler LEA instructions in place
37329 // of single 'add' instructions, but the cost model for selecting an LEA
37330 // currently has a high threshold.
37331 bool HasLEAPotential = false;
37332 for (auto *User : Ext->uses()) {
37333 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
37334 HasLEAPotential = true;
37335 break;
37336 }
37337 }
37338 if (!HasLEAPotential)
37339 return SDValue();
37341 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
37342 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
37343 SDValue AddOp0 = Add.getOperand(0);
37344 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
37345 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
37347 // The wider add is guaranteed to not wrap because both operands are
37348 // sign- or zero-extended.
37349 SDNodeFlags Flags;
37350 Flags.setNoSignedWrap(NSW);
37351 Flags.setNoUnsignedWrap(NUW);
37352 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
37353 }
37355 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
37356 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
37357 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
37358 /// extends from AH (which we otherwise need to do contortions to access).
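// Illustrative example (unsigned case, assumed i8 operands):
//   (i32 zext (i8 (udivrem x, y):1))
// becomes (udivrem8_zext_hreg x, y):1, so the remainder is produced already
// zero-extended from AH.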
37359 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
37360 SDValue N0 = N->getOperand(0);
37361 auto OpcodeN = N->getOpcode();
37362 auto OpcodeN0 = N0.getOpcode();
37363 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
37364 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
37365 return SDValue();
37367 EVT VT = N->getValueType(0);
37368 EVT InVT = N0.getValueType();
37369 if (N0.getResNo() != 1 || InVT != MVT::i8 ||
37370 !(VT == MVT::i32 || VT == MVT::i64))
37371 return SDValue();
37373 SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
37374 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
37375 : X86ISD::UDIVREM8_ZEXT_HREG;
37376 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
37377 N0.getOperand(1));
37378 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
37379 // If this was a 64-bit extend, complete it.
37380 if (VT == MVT::i64)
37381 return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
37382 return R.getValue(1);
37383 }
37385 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
37386 // operands and the result of CMOV is not used anywhere else - promote CMOV
37387 // itself instead of promoting its result. This could be beneficial, because:
37388 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
37389 // (or more) pseudo-CMOVs only when they go one-after-another and
37390 // getting rid of result extension code after CMOV will help that.
37391 // 2) Promotion of constant CMOV arguments is free, hence the
37392 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
37393 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
37394 // promotion is also good in terms of code-size.
37395 // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
37396 // promotion.)
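// Illustrative example (hypothetical constants):
//   (i32 sext (i16 cmov C0, C1, cc, eflags))
// becomes (i32 cmov (i32 sext C0), (i32 sext C1), cc, eflags), where both
// extended constants are folded at compile time.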
37397 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
37398 SDValue CMovN = Extend->getOperand(0);
37399 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
37400 return SDValue();
37402 EVT TargetVT = Extend->getValueType(0);
37403 unsigned ExtendOpcode = Extend->getOpcode();
37404 SDLoc DL(Extend);
37406 EVT VT = CMovN.getValueType();
37407 SDValue CMovOp0 = CMovN.getOperand(0);
37408 SDValue CMovOp1 = CMovN.getOperand(1);
37410 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
37411 !isa<ConstantSDNode>(CMovOp1.getNode()))
37412 return SDValue();
37414 // Only extend to i32 or i64.
37415 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
37416 return SDValue();
37418 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
37419 // i32 are free.
37420 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
37421 return SDValue();
37423 // If this is a zero extend to i64, we should only extend to i32 and use a
37424 // free zero extend to finish.
37425 EVT ExtendVT = TargetVT;
37426 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
37427 ExtendVT = MVT::i32;
37429 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
37430 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
37432 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
37433 CMovN.getOperand(2), CMovN.getOperand(3));
37435 // Finish extending if needed.
37436 if (ExtendVT != TargetVT)
37437 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
37439 return Res;
37440 }
37442 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
37443 // This is more or less the reverse of combineBitcastvxi1.
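// Illustrative example (assumed v8i16 result, i8 source): for
//   (v8i16 sext (v8i1 bitcast (i8 x)))
// the code below broadcasts x, ANDs each lane with its bit (1, 2, 4, ...),
// compares for equality against that mask, and sign-extends, so each lane
// ends up 0 or -1.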
37444 static SDValue
37445 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
37446 TargetLowering::DAGCombinerInfo &DCI,
37447 const X86Subtarget &Subtarget) {
37448 unsigned Opcode = N->getOpcode();
37449 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
37450 Opcode != ISD::ANY_EXTEND)
37451 return SDValue();
37452 if (!DCI.isBeforeLegalizeOps())
37453 return SDValue();
37454 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
37455 return SDValue();
37457 SDValue N0 = N->getOperand(0);
37458 EVT VT = N->getValueType(0);
37459 EVT SVT = VT.getScalarType();
37460 EVT InSVT = N0.getValueType().getScalarType();
37461 unsigned EltSizeInBits = SVT.getSizeInBits();
37463 // Input type must be extending a bool vector (bit-casted from a scalar
37464 // integer) to legal integer types.
37465 if (!VT.isVector())
37466 return SDValue();
37467 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
37468 return SDValue();
37469 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
37470 return SDValue();
37472 SDValue N00 = N0.getOperand(0);
37473 EVT SclVT = N0.getOperand(0).getValueType();
37474 if (!SclVT.isScalarInteger())
37475 return SDValue();
37477 SDLoc DL(N);
37478 SDValue Vec;
37479 SmallVector<int, 32> ShuffleMask;
37480 unsigned NumElts = VT.getVectorNumElements();
37481 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
37483 // Broadcast the scalar integer to the vector elements.
37484 if (NumElts > EltSizeInBits) {
37485 // If the scalar integer is greater than the vector element size, then we
37486 // must split it down into sub-sections for broadcasting. For example:
37487 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
37488 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
37489 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
37490 unsigned Scale = NumElts / EltSizeInBits;
37491 EVT BroadcastVT =
37492 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
37493 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
37494 Vec = DAG.getBitcast(VT, Vec);
37496 for (unsigned i = 0; i != Scale; ++i)
37497 ShuffleMask.append(EltSizeInBits, i);
37498 } else {
37499 // For smaller scalar integers, we can simply any-extend it to the vector
37500 // element size (we don't care about the upper bits) and broadcast it to all
37501 // elements.
37502 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
37503 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
37504 ShuffleMask.append(NumElts, 0);
37505 }
37506 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
37508 // Now, mask the relevant bit in each element.
37509 SmallVector<SDValue, 32> Bits;
37510 for (unsigned i = 0; i != NumElts; ++i) {
37511 int BitIdx = (i % EltSizeInBits);
37512 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
37513 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
37514 }
37515 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
37516 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
37518 // Compare against the bitmask and extend the result.
37519 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
37520 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
37521 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
37523 // For SEXT, this is now done, otherwise shift the result down for
37524 // zero-extension.
37525 if (Opcode == ISD::SIGN_EXTEND)
37526 return Vec;
37527 return DAG.getNode(ISD::SRL, DL, VT, Vec,
37528 DAG.getConstant(EltSizeInBits - 1, DL, VT));
37529 }
37531 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
37532 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
37533 /// with UNDEFs) of the input to vectors of the same size as the target type
37534 /// which then extends the lowest elements.
37535 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
37536 TargetLowering::DAGCombinerInfo &DCI,
37537 const X86Subtarget &Subtarget) {
37538 unsigned Opcode = N->getOpcode();
37539 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
37540 return SDValue();
37541 if (!DCI.isBeforeLegalizeOps())
37542 return SDValue();
37543 if (!Subtarget.hasSSE2())
37544 return SDValue();
37546 SDValue N0 = N->getOperand(0);
37547 EVT VT = N->getValueType(0);
37548 EVT SVT = VT.getScalarType();
37549 EVT InVT = N0.getValueType();
37550 EVT InSVT = InVT.getScalarType();
37552 // Input type must be a vector and we must be extending legal integer types.
37553 if (!VT.isVector())
37554 return SDValue();
37555 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
37556 return SDValue();
37557 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
37558 return SDValue();
37560 // On AVX2+ targets, if the input/output types are both legal then we will be
37561 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
37562 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
37563 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
37564 return SDValue();
37566 SDLoc DL(N);
37568 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
37569 EVT InVT = N.getValueType();
37570 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
37571 Size / InVT.getScalarSizeInBits());
37572 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
37573 DAG.getUNDEF(InVT));
37574 Opnds[0] = N;
37575 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
37576 };
37578 // If target-size is less than 128-bits, extend to a type that would extend
37579 // to 128 bits, extend that and extract the original target vector.
37580 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
37581 unsigned Scale = 128 / VT.getSizeInBits();
37582 EVT ExVT =
37583 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
37584 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
37585 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
37586 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
37587 DAG.getIntPtrConstant(0, DL));
37588 }
37590 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
37591 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
37592 // Also use this if we don't have SSE41 to allow the legalizer do its job.
37593 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
37594 (VT.is256BitVector() && Subtarget.hasInt256()) ||
37595 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
37596 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
37597 return Opcode == ISD::SIGN_EXTEND
37598 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
37599 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
37600 }
37602 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
37603 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
37604 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
37605 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
37606 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
37608 SmallVector<SDValue, 8> Opnds;
37609 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
37610 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
37611 DAG.getIntPtrConstant(Offset, DL));
37612 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
37613 SrcVec = Opcode == ISD::SIGN_EXTEND
37614 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
37615 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
37616 Opnds.push_back(SrcVec);
37617 }
37618 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
37619 };
37621 // On pre-AVX2 targets, split into 128-bit nodes of
37622 // ISD::*_EXTEND_VECTOR_INREG.
37623 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
37624 return SplitAndExtendInReg(128);
37626 // On pre-AVX512 targets, split into 256-bit nodes of
37627 // ISD::*_EXTEND_VECTOR_INREG.
37628 if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
37629 return SplitAndExtendInReg(256);
37631 return SDValue();
37632 }
37634 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
37635 // result type.
37636 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
37637 const X86Subtarget &Subtarget) {
37638 SDValue N0 = N->getOperand(0);
37639 EVT VT = N->getValueType(0);
37640 SDLoc dl(N);
37642 // Only do this combine with AVX512 for vector extends.
37643 if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
37644 return SDValue();
37646 // Only combine legal element types.
37647 EVT SVT = VT.getVectorElementType();
37648 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
37649 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
37650 return SDValue();
37652 // We can only do this if the vector size is 256 bits or less.
37653 unsigned Size = VT.getSizeInBits();
37654 if (Size > 256)
37655 return SDValue();
37657 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
37658 // those are the only integer compares we have.
37659 ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
37660 if (ISD::isUnsignedIntSetCC(CC))
37661 return SDValue();
37663 // Only do this combine if the extension will be fully consumed by the setcc.
37664 EVT N00VT = N0.getOperand(0).getValueType();
37665 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
37666 if (Size != MatchingVecType.getSizeInBits())
37667 return SDValue();
37669 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
37671 if (N->getOpcode() == ISD::ZERO_EXTEND)
37672 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
37674 return Res;
37675 }
37677 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
37678 TargetLowering::DAGCombinerInfo &DCI,
37679 const X86Subtarget &Subtarget) {
37680 SDValue N0 = N->getOperand(0);
37681 EVT VT = N->getValueType(0);
37682 EVT InVT = N0.getValueType();
37683 SDLoc DL(N);
37685 if (SDValue DivRem8 = getDivRem8(N, DAG))
37686 return DivRem8;
37688 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
37689 return NewCMov;
37691 if (!DCI.isBeforeLegalizeOps())
37692 return SDValue();
37694 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
37695 return V;
37697 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
37698 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
37699 // Invert and sign-extend a boolean is the same as zero-extend and subtract
37700 // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
37701 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
37702 // sext (xor Bool, -1) --> sub (zext Bool), 1
37703 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
37704 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
37705 }
37707 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
37708 return V;
37710 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
37711 return V;
37713 if (VT.isVector())
37714 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
37715 return R;
37717 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
37718 return NewAdd;
37720 return SDValue();
37721 }
37723 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
37724 if (NegMul) {
37725 switch (Opcode) {
37726 default: llvm_unreachable("Unexpected opcode");
37727 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
37728 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
37729 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
37730 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
37731 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
37732 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
37733 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
37734 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
37735 }
37736 }
37738 if (NegAcc) {
37739 switch (Opcode) {
37740 default: llvm_unreachable("Unexpected opcode");
37741 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
37742 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
37743 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
37744 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
37745 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
37746 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
37747 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
37748 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
37749 }
37750 }
37752 return Opcode;
37753 }
37755 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
37756 const X86Subtarget &Subtarget) {
37757 SDLoc dl(N);
37758 EVT VT = N->getValueType(0);
37760 // Let legalize expand this if it isn't a legal type yet.
37761 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
37762 return SDValue();
37764 EVT ScalarVT = VT.getScalarType();
37765 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
37766 return SDValue();
37768 SDValue A = N->getOperand(0);
37769 SDValue B = N->getOperand(1);
37770 SDValue C = N->getOperand(2);
37772 auto invertIfNegative = [&DAG](SDValue &V) {
37773 if (SDValue NegVal = isFNEG(V.getNode())) {
37774 V = DAG.getBitcast(V.getValueType(), NegVal);
37775 return true;
37776 }
37777 // Look through extract_vector_elts. If it comes from an FNEG, create a
37778 // new extract from the FNEG input.
37779 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
37780 isa<ConstantSDNode>(V.getOperand(1)) &&
37781 cast<ConstantSDNode>(V.getOperand(1))->getZExtValue() == 0) {
37782 if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) {
37783 NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
37784 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
37785 NegVal, V.getOperand(1));
37786 return true;
37787 }
37788 }
37790 return false;
37791 };
37793 // Do not convert the passthru input of scalar intrinsics.
37794 // FIXME: We could allow negations of the lower element only.
37795 bool NegA = invertIfNegative(A);
37796 bool NegB = invertIfNegative(B);
37797 bool NegC = invertIfNegative(C);
37799 if (!NegA && !NegB && !NegC)
37800 return SDValue();
37802 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
37804 if (N->getNumOperands() == 4)
37805 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
37806 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
37807 }
37809 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
37810 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
37811 const X86Subtarget &Subtarget) {
37812 SDLoc dl(N);
37813 EVT VT = N->getValueType(0);
37815 SDValue NegVal = isFNEG(N->getOperand(2).getNode());
37816 if (!NegVal)
37817 return SDValue();
37819 unsigned NewOpcode;
37820 switch (N->getOpcode()) {
37821 default: llvm_unreachable("Unexpected opcode!");
37822 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
37823 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
37824 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
37825 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
37826 }
37828 if (N->getNumOperands() == 4)
37829 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37830 NegVal, N->getOperand(3));
37831 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37832 NegVal);
37833 }
37835 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
37836 TargetLowering::DAGCombinerInfo &DCI,
37837 const X86Subtarget &Subtarget) {
37838 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
37839 // (and (i32 x86isd::setcc_carry), 1)
37840 // This eliminates the zext. This transformation is necessary because
37841 // ISD::SETCC is always legalized to i8.
37842 SDLoc dl(N);
37843 SDValue N0 = N->getOperand(0);
37844 EVT VT = N->getValueType(0);
37846 if (N0.getOpcode() == ISD::AND &&
37847 N0.hasOneUse() &&
37848 N0.getOperand(0).hasOneUse()) {
37849 SDValue N00 = N0.getOperand(0);
37850 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
37851 if (!isOneConstant(N0.getOperand(1)))
37852 return SDValue();
37853 return DAG.getNode(ISD::AND, dl, VT,
37854 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
37855 N00.getOperand(0), N00.getOperand(1)),
37856 DAG.getConstant(1, dl, VT));
37857 }
37858 }
37860 if (N0.getOpcode() == ISD::TRUNCATE &&
37861 N0.hasOneUse() &&
37862 N0.getOperand(0).hasOneUse()) {
37863 SDValue N00 = N0.getOperand(0);
37864 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
37865 return DAG.getNode(ISD::AND, dl, VT,
37866 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
37867 N00.getOperand(0), N00.getOperand(1)),
37868 DAG.getConstant(1, dl, VT));
37869 }
37870 }
37872 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
37873 return NewCMov;
37875 if (DCI.isBeforeLegalizeOps())
37876 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
37877 return V;
37879 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
37880 return V;
37882 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
37883 return V;
37885 if (VT.isVector())
37886 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
37887 return R;
37889 if (SDValue DivRem8 = getDivRem8(N, DAG))
37890 return DivRem8;
37892 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
37893 return NewAdd;
37895 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
37896 return R;
37898 return SDValue();
37899 }
37901 /// Try to map a 128-bit or larger integer comparison to vector instructions
37902 /// before type legalization splits it up into chunks.
37903 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
37904 const X86Subtarget &Subtarget) {
37905 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
37906 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
37908 // We're looking for an oversized integer equality comparison.
37909 SDValue X = SetCC->getOperand(0);
37910 SDValue Y = SetCC->getOperand(1);
37911 EVT OpVT = X.getValueType();
37912 unsigned OpSize = OpVT.getSizeInBits();
37913 if (!OpVT.isScalarInteger() || OpSize < 128)
37914 return SDValue();
37916 // Ignore a comparison with zero because that gets special treatment in
37917 // EmitTest(). But make an exception for the special case of a pair of
37918 // logically-combined vector-sized operands compared to zero. This pattern may
37919 // be generated by the memcmp expansion pass with oversized integer compares
37920 // (see PR33325).
37921 bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
37922 X.getOperand(0).getOpcode() == ISD::XOR &&
37923 X.getOperand(1).getOpcode() == ISD::XOR;
37924 if (isNullConstant(Y) && !IsOrXorXorCCZero)
37925 return SDValue();
37927 // Bail out if we know that this is not really just an oversized integer.
37928 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
37929 peekThroughBitcasts(Y).getValueType() == MVT::f128)
37930 return SDValue();
37932 // TODO: Use PXOR + PTEST for SSE4.1 or later?
37933 // TODO: Add support for AVX-512.
37934 EVT VT = SetCC->getValueType(0);
37935 SDLoc DL(SetCC);
37936 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
37937 (OpSize == 256 && Subtarget.hasAVX2())) {
37938 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
37939 SDValue Cmp;
37940 if (IsOrXorXorCCZero) {
37941 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
37942 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
37943 // Use 2 vector equality compares and 'and' the results before doing a
37944 // MOVMSK.
37945 SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
37946 SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
37947 SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
37948 SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
37949 SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
37950 SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
37951 Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
37952 } else {
37953 SDValue VecX = DAG.getBitcast(VecVT, X);
37954 SDValue VecY = DAG.getBitcast(VecVT, Y);
37955 Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
37956 }
37957 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
37958 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
37959 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
37960 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
37961 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
37962 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
37963 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
37964 MVT::i32);
37965 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
37966 }
37968 return SDValue();
37969 }
37971 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
37972 const X86Subtarget &Subtarget) {
37973 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
37974 SDValue LHS = N->getOperand(0);
37975 SDValue RHS = N->getOperand(1);
37976 EVT VT = N->getValueType(0);
37977 EVT OpVT = LHS.getValueType();
37978 SDLoc DL(N);
37980 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
37981 // 0-x == y --> x+y == 0
37982 // 0-x != y --> x+y != 0
37983 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
37984 LHS.hasOneUse()) {
37985 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
37986 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
37987 }
37988 // x == 0-y --> x+y == 0
37989 // x != 0-y --> x+y != 0
37990 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
37991 RHS.hasOneUse()) {
37992 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
37993 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
37994 }
37996 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
37997 return V;
37998 }
38000 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
38001 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
38002 // Put build_vectors on the right.
38003 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
38004 std::swap(LHS, RHS);
38005 CC = ISD::getSetCCSwappedOperands(CC);
38006 }
38008 bool IsSEXT0 =
38009 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
38010 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
38011 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
38013 if (IsSEXT0 && IsVZero1) {
38014 assert(VT == LHS.getOperand(0).getValueType() &&
38015 "Unexpected operand type");
38016 if (CC == ISD::SETGT)
38017 return DAG.getConstant(0, DL, VT);
38018 if (CC == ISD::SETLE)
38019 return DAG.getConstant(1, DL, VT);
38020 if (CC == ISD::SETEQ || CC == ISD::SETGE)
38021 return DAG.getNOT(DL, LHS.getOperand(0), VT);
38023 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
38024 "Unexpected condition code!");
38025 return LHS.getOperand(0);
38026 }
38027 }
38029 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
38030 // pre-promote its result type since vXi1 vectors don't get promoted
38031 // during type legalization.
38032 // NOTE: The element count check is to ignore operand types that need to
38033 // go through type promotion to a 128-bit vector.
38034 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
38035 VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
38036 (OpVT.getVectorElementType() == MVT::i8 ||
38037 OpVT.getVectorElementType() == MVT::i16)) {
38038 SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
38039 N->getOperand(2));
38040 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
38041 }
38043 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
38044 // to avoid scalarization via legalization because v4i32 is not a legal type.
38045 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
38046 LHS.getValueType() == MVT::v4f32)
38047 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
38049 return SDValue();
38050 }
38052 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
38053 TargetLowering::DAGCombinerInfo &DCI) {
38054 SDValue Src = N->getOperand(0);
38055 MVT SrcVT = Src.getSimpleValueType();
38057 // Perform constant folding.
38058 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
38059 assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
38060 APInt Imm(32, 0);
38061 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
38062 SDValue In = Src.getOperand(Idx);
38063 if (!In.isUndef() &&
38064 cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
38065 Imm.setBit(Idx);
38066 }
38067 return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
38068 }
38070 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38071 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38072 !DCI.isBeforeLegalizeOps());
38074 // MOVMSK only uses the MSB from each vector element.
38075 KnownBits Known;
38076 APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
38077 if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
38078 DCI.AddToWorklist(Src.getNode());
38079 DCI.CommitTargetLoweringOpt(TLO);
38080 return SDValue(N, 0);
38081 }
38083 return SDValue();
38084 }
38086 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
38087 TargetLowering::DAGCombinerInfo &DCI,
38088 const X86Subtarget &Subtarget) {
38089 SDLoc DL(N);
38091 if (DCI.isBeforeLegalizeOps()) {
38092 SDValue Index = N->getOperand(4);
38093 // Remove any sign extends from 32 or smaller to larger than 32.
38094 // Only do this before LegalizeOps in case we need the sign extend for
38095 // legalization.
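// Illustrative sketch (assumed v8i32 index, hypothetical shapes): an index of
//   (v8i64 sext (v8i32 idx))
// is rewritten so the node uses (v8i32 idx) directly; if the wide index is
// really needed it can be re-introduced during lowering.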
38096 if (Index.getOpcode() == ISD::SIGN_EXTEND) {
38097 if (Index.getScalarValueSizeInBits() > 32 &&
38098 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
38099 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38100 NewOps[4] = Index.getOperand(0);
38101 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
38102 if (Res == N) {
38103 // The original sign extend now has fewer users; add it back to the
38104 // worklist in case it needs to be removed.
38105 DCI.AddToWorklist(Index.getNode());
38106 DCI.AddToWorklist(N);
38107 }
38108 return SDValue(Res, 0);
38109 }
38110 }
38112 // Make sure the index is either i32 or i64
38113 unsigned ScalarSize = Index.getScalarValueSizeInBits();
38114 if (ScalarSize != 32 && ScalarSize != 64) {
38115 MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
38116 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
38117 Index.getValueType().getVectorNumElements());
38118 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
38119 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38120 NewOps[4] = Index;
38121 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
38122 if (Res == N)
38123 DCI.AddToWorklist(N);
38124 return SDValue(Res, 0);
38125 }
38127 // Try to remove zero extends from 32->64 if we know the sign bit of
38128 // the input is zero.
38129 if (Index.getOpcode() == ISD::ZERO_EXTEND &&
38130 Index.getScalarValueSizeInBits() == 64 &&
38131 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
38132 if (DAG.SignBitIsZero(Index.getOperand(0))) {
38133 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38134 NewOps[4] = Index.getOperand(0);
38135 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
38136 if (Res == N) {
38137 // The original zero extend now has fewer users; add it back to the
38138 // worklist in case it needs to be removed.
38139 DCI.AddToWorklist(Index.getNode());
38140 DCI.AddToWorklist(N);
38141 }
38142 return SDValue(Res, 0);
38143 }
38144 }
38145 }
38147 // With AVX2 we only demand the upper bit of the mask.
38148 if (!Subtarget.hasAVX512()) {
38149 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38150 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38151 !DCI.isBeforeLegalizeOps());
38152 SDValue Mask = N->getOperand(2);
38153 KnownBits Known;
38154 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
38155 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
38156 DCI.AddToWorklist(Mask.getNode());
38157 DCI.CommitTargetLoweringOpt(TLO);
38158 return SDValue(N, 0);
38159 }
38160 }
38162 return SDValue();
38163 }
38165 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
38166 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
38167 const X86Subtarget &Subtarget) {
38168 SDLoc DL(N);
38169 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
38170 SDValue EFLAGS = N->getOperand(1);
38172 // Try to simplify the EFLAGS and condition code operands.
38173 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
38174 return getSETCC(CC, Flags, DL, DAG);
38176 return SDValue();
38177 }
38179 /// Optimize branch condition evaluation.
38180 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
38181 const X86Subtarget &Subtarget) {
38182 SDLoc DL(N);
38183 SDValue EFLAGS = N->getOperand(3);
38184 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
38186 // Try to simplify the EFLAGS and condition code operands.
38187 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
38188 // RAUW them under us.
38189 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
38190 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
38191 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
38192 N->getOperand(1), Cond, Flags);
38193 }
38195 return SDValue();
38196 }
38198 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
38199 SelectionDAG &DAG) {
38200 // Take advantage of vector comparisons producing 0 or -1 in each lane to
38201 // optimize away the operation when it's from a constant.
38203 // The general transformation is:
38204 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
38205 // AND(VECTOR_CMP(x,y), constant2)
38206 // constant2 = UNARYOP(constant)
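// Illustrative example (hypothetical vXi32 constants): for
//   (v4f32 sint_to_fp (and (setcc x, y), <1,1,1,1>))
// each compare lane is 0 or -1, so the AND yields 0 or 1 per lane and the
// conversion of the constant can be done up front:
//   (v4f32 bitcast (and (setcc x, y), (bitcast <1.0,1.0,1.0,1.0>)))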
38208 // Early exit if this isn't a vector operation, the operand of the
38209 // unary operation isn't a bitwise AND, or if the sizes of the operations
38210 // aren't the same.
38211 EVT VT = N->getValueType(0);
38212 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
38213 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
38214 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
38215 return SDValue();
38217 // Now check that the other operand of the AND is a constant. We could
38218 // make the transformation for non-constant splats as well, but it's unclear
38219 // that would be a benefit as it would not eliminate any operations, just
38220 // perform one more step in scalar code before moving to the vector unit.
38221 if (BuildVectorSDNode *BV =
38222 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
38223 // Bail out if the vector isn't a constant.
38224 if (!BV->isConstant())
38225 return SDValue();
38227 // Everything checks out. Build up the new and improved node.
38228 SDLoc DL(N);
38229 EVT IntVT = BV->getValueType(0);
38230 // Create a new constant of the appropriate type for the transformed
38231 // DAG.
38232 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
38233 // The AND node needs bitcasts to/from an integer vector type around it.
38234 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
38235 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
38236 N->getOperand(0)->getOperand(0), MaskConst);
38237 SDValue Res = DAG.getBitcast(VT, NewAnd);
38238 return Res;
38239 }
38241 return SDValue();
38242 }
38244 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
38245 const X86Subtarget &Subtarget) {
38246 SDValue Op0 = N->getOperand(0);
38247 EVT VT = N->getValueType(0);
38248 EVT InVT = Op0.getValueType();
38250 // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38251 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
38252 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
38253 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38254 SDLoc dl(N);
38255 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38256 InVT.getVectorNumElements());
38257 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
38259 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
38260 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38261 }
38263 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
38264 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
38265 // the optimization here.
38266 if (DAG.SignBitIsZero(Op0))
38267 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
38269 return SDValue();
38270 }
38272 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
38273 const X86Subtarget &Subtarget) {
38274 // First try to optimize away the conversion entirely when it's
38275 // conditionally from a constant. Vectors only.
38276 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
38279 // Now move on to more general possibilities.
38280 SDValue Op0 = N->getOperand(0);
38281 EVT VT = N->getValueType(0);
38282 EVT InVT = Op0.getValueType();
38284 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38285 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
38286 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
38287 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38288 SDLoc dl(N);
38289 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38290 InVT.getVectorNumElements());
38291 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
38292 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38293 }
38295 // Without AVX512DQ we only support i64 to float scalar conversion. For both
38296 // vectors and scalars, see if we know that the upper bits are all the sign
38297 // bit, in which case we can truncate the input to i32 and convert from that.
38298 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
38299 unsigned BitWidth = InVT.getScalarSizeInBits();
38300 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
38301 if (NumSignBits >= (BitWidth - 31)) {
38302 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
38303 if (InVT.isVector())
38304 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
38305 InVT.getVectorNumElements());
38306 SDLoc dl(N);
38307 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
38308 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
38309 }
38310 }
38312 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
38313 // a 32-bit target where SSE doesn't support i64->FP operations.
38314 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
38315 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
38316 EVT LdVT = Ld->getValueType(0);
38318 // This transformation is not supported if the result type is f16 or f128.
38319 if (VT == MVT::f16 || VT == MVT::f128)
38320 return SDValue();
38322 // If we have AVX512DQ we can use packed conversion instructions unless
38323 // the VT is f80.
38324 if (Subtarget.hasDQI() && VT != MVT::f80)
38325 return SDValue();
38327 if (!Ld->isVolatile() && !VT.isVector() &&
38328 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
38329 !Subtarget.is64Bit() && LdVT == MVT::i64) {
38330 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
38331 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
38332 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
38333 return FILDChain;
38334 }
38335 }
38336 return SDValue();
38337 }
38339 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
38340 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38341 MVT VT = N->getSimpleValueType(0);
38342 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38343 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
38344 N->getOperand(0), N->getOperand(1),
38345 Flags);
38346 }
38348 return SDValue();
38349 }
38351 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
38352 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
38353 TargetLowering::DAGCombinerInfo &DCI) {
38354 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
38355 // the result is either zero or one (depending on the input carry bit).
38356 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
38357 if (X86::isZeroNode(N->getOperand(0)) &&
38358 X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // the flag result is unused.
38361 SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
38364 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
38365 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
38366 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, DL,
                                                           MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

38374 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38375 MVT VT = N->getSimpleValueType(0);
38376 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38377 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  return SDValue();
}
38385 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
38386 /// which is more useful than 0/1 in some cases.
38387 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
38389 // "Condition code B" is also known as "the carry flag" (CF).
38390 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
38391 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
38392 MVT VT = N->getSimpleValueType(0);
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

38396 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
}
38400 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
38401 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
38402 /// with CMP+{ADC, SBB}.
38403 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
38404 bool IsSub = N->getOpcode() == ISD::SUB;
38405 SDValue X = N->getOperand(0);
38406 SDValue Y = N->getOperand(1);
38408 // If this is an add, canonicalize a zext operand to the RHS.
38409 // TODO: Incomplete? What if both sides are zexts?
38410 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
      Y.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(X, Y);
38414 // Look through a one-use zext.
38415 bool PeekedThroughZext = false;
38416 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
38417 Y = Y.getOperand(0);
38418 PeekedThroughZext = true;
38421 // If this is an add, canonicalize a setcc operand to the RHS.
38422 // TODO: Incomplete? What if both sides are setcc?
38423 // TODO: Should we allow peeking through a zext of the other operand?
38424 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);
  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
38433 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
38435 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38436 // the general case below.
38437 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
  if (ConstantX) {
    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
38440 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
38441 // This is a complicated way to get -1 or 0 from the carry flag:
38442 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38443 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38444 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
                         Y.getOperand(1));
    }

38449 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
38450 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
38451 SDValue EFLAGS = Y->getOperand(1);
38452 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38453 EFLAGS.getValueType().isInteger() &&
38454 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38455 // Swap the operands of a SUB, and we have the same pattern as above.
38456 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
38457 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
38458 SDValue NewSub = DAG.getNode(
38459 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
38460 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38461 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38462 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                           DAG.getConstant(X86::COND_B, DL, MVT::i8),
                           NewEFLAGS);
      }
    }
  }

38469 if (CC == X86::COND_B) {
38470 // X + SETB Z --> X + (mask SBB Z, Z)
38471 // X - SETB Z --> X - (mask SBB Z, Z)
38472 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
38473 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
38474 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38475 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
  }

38479 if (CC == X86::COND_A) {
38480 SDValue EFLAGS = Y->getOperand(1);
38481 // Try to convert COND_A into COND_B in an attempt to facilitate
38482 // materializing "setb reg".
38484 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
38485 // cannot take an immediate as its first operand.
38487 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38488 EFLAGS.getValueType().isInteger() &&
38489 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38490 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
38491 EFLAGS.getNode()->getVTList(),
38492 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38493 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38494 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
38495 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38496 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

38504 SDValue Cmp = Y.getOperand(1);
38505 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
38506 !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

38510 SDValue Z = Cmp.getOperand(0);
38511 EVT ZVT = Z.getValueType();
38513 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38514 // the general case below.
  if (ConstantX) {
    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
    // fake operands:
38518 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
38519 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
38520 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
38521 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
38522 SDValue Zero = DAG.getConstant(0, DL, ZVT);
38523 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
38524 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
38525 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38526 DAG.getConstant(X86::COND_B, DL, MVT::i8),
                         SDValue(Neg.getNode(), 1));
    }

38530 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
38531 // with fake operands:
38532 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
38533 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
38534 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
38535 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
38536 SDValue One = DAG.getConstant(1, DL, ZVT);
38537 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38538 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
    }
  }

38543 // (cmp Z, 1) sets the carry flag if Z is 0.
38544 SDValue One = DAG.getConstant(1, DL, ZVT);
38545 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38547 // Add the flags type for ADC/SBB nodes.
38548 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38550 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
38551 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
38552 if (CC == X86::COND_NE)
38553 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
38554 DAG.getConstant(-1ULL, DL, VT), Cmp1);
38556 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
38557 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
38558 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), Cmp1);
}
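// Attempt to turn a vector-reduction add of a sign-extended i16 multiply into
// VPMADDWD, e.g.:
//   add (mul (sext X), (sext Y)), Phi
//     --> add (concat (vpmaddwd X', Y'), 0), Phi
// where X' and Y' are the multiply operands truncated back to vXi16.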
38562 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
38563 const X86Subtarget &Subtarget) {
38564 if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue MulOp = N->getOperand(0);
38568 SDValue Phi = N->getOperand(1);
38570 if (MulOp.getOpcode() != ISD::MUL)
38571 std::swap(MulOp, Phi);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();
38579 EVT VT = N->getValueType(0);
38581 // If the vector size is less than 128, or greater than the supported RegSize,
38582 // do not use PMADD.
  if (VT.getVectorNumElements() < 8)
    return SDValue();

38587 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38588 VT.getVectorNumElements());
38589 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38590 VT.getVectorNumElements() / 2);
  SDLoc DL(N);

  // Shrink the operands of mul.
38593 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
38594 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
38596 // Madd vector size is half of the original vector size
38597 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38598 ArrayRef<SDValue> Ops) {
38599 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
38600 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
  SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
                                  PMADDWDBuilder);
38604 // Fill the rest of the output with 0
38605 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
38606 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
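// Attempt to turn a vector-reduction add whose other operand is a vselect
// over a zext'd absolute-difference pattern into PSADBW plus an add into the
// reduction vector.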
38610 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
38611 const X86Subtarget &Subtarget) {
38612 if (!Subtarget.hasSSE2())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
38617 SDValue Op0 = N->getOperand(0);
38618 SDValue Op1 = N->getOperand(1);
38620 // TODO: There's nothing special about i32, any integer type above i16 should
38621 // work just as well.
  if (!VT.isVector() || !VT.isSimple() ||
      VT.getVectorElementType() != MVT::i32)
    return SDValue();

38626 unsigned RegSize = 128;
  if (Subtarget.useBWIRegs())
    RegSize = 512;
  else if (Subtarget.hasAVX())
    RegSize = 256;
38632 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
38633 // TODO: We should be able to handle larger vectors by splitting them before
38634 // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

38638 // We know N is a reduction add, which means one of its operands is a phi.
38639 // To match SAD, we need the other operand to be a vector select.
38640 SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();
38650 // Check whether we have an abs-diff pattern feeding into the select.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();
38654 // SAD pattern detected. Now build a SAD instruction and an addition for
38655 // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we can only update
  // part of the elements in the reduction vector.
38658 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
38660 // The output of PSADBW is a vector of i64.
38661 // We need to turn the vector of i64 into a vector of i32.
38662 // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
38665 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
38666 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
38671 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
38672 // Fill the upper elements with zero to match the add width.
38673 SDValue Zero = DAG.getConstant(0, DL, VT);
38674 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
38675 DAG.getIntPtrConstant(0, DL));
  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
38681 /// Convert vector increment or decrement to sub/add with an all-ones constant:
38682 /// add X, <1, 1...> --> sub X, <-1, -1...>
38683 /// sub X, <1, 1...> --> add X, <-1, -1...>
38684 /// The all-ones vector constant can be materialized using a pcmpeq instruction
38685 /// that is commonly recognized as an idiom (has no register dependency), so
38686 /// that's better/smaller than loading a splat 1 constant.
38687 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
38688 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
38689 "Unexpected opcode for increment/decrement transform");
38691 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
38692 // out and wait for legalization if we have an unsupported vector length.
38693 EVT VT = N->getValueType(0);
38694 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
38697 SDNode *N1 = N->getOperand(1).getNode();
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(N1, SplatVal) ||
38700 !SplatVal.isOneValue())
38703 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
38704 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}
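// Try to replace an add of two interleaved odd/even extractions of a widened
// multiply with a single VPMADDWD node; the exact pattern is documented
// below.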
38708 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
38709 const SDLoc &DL, EVT VT,
38710 const X86Subtarget &Subtarget) {
38711 // Example of pattern we try to detect:
38712 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
  // (add (build_vector (extract_elt t, 0),
38714 // (extract_elt t, 2),
38715 // (extract_elt t, 4),
38716 // (extract_elt t, 6)),
38717 // (build_vector (extract_elt t, 1),
38718 // (extract_elt t, 3),
38719 // (extract_elt t, 5),
38720 // (extract_elt t, 7)))
38722 if (!Subtarget.hasSSE2())
    return SDValue();

  if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
      Op1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();
38729 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
38730 VT.getVectorNumElements() < 4 ||
      !isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

38734 // Check if one of Op0,Op1 is of the form:
38735 // (build_vector (extract_elt Mul, 0),
38736 // (extract_elt Mul, 2),
  //                 (extract_elt Mul, 4),
  //                 (extract_elt Mul, 6)),
  // the other is of the form:
38740 // (build_vector (extract_elt Mul, 1),
38741 // (extract_elt Mul, 3),
  //                 (extract_elt Mul, 5),
  //                 (extract_elt Mul, 7)))
38744 // and identify Mul.
  SDValue Mul;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
38747 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
38748 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
38749 // TODO: Be more tolerant to undefs.
38750 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38751 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38752 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
38755 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
38756 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
38757 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
38758 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
    if (!Const0L || !Const1L || !Const0H || !Const1H)
      return SDValue();
38761 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
38762 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
38763 // Commutativity of mul allows factors of a product to reorder.
    if (Idx0L > Idx1L)
      std::swap(Idx0L, Idx1L);
    if (Idx0H > Idx1H)
      std::swap(Idx0H, Idx1H);
38768 // Commutativity of add allows pairs of factors to reorder.
38769 if (Idx0L > Idx0H) {
38770 std::swap(Idx0L, Idx0H);
38771 std::swap(Idx1L, Idx1H);
38773 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
        Idx1H != 2 * i + 3)
      return SDValue();
    if (!Mul) {
      // First time an extract_elt's source vector is visited. Must be a MUL
      // with a 2X number of vector elements compared to the BUILD_VECTOR.
      // Both extracts must be from the same MUL.
      Mul = Op0L->getOperand(0);
38781 if (Mul->getOpcode() != ISD::MUL ||
          Mul.getValueType().getVectorNumElements() != 2 * e)
        return SDValue();
    }

    // Check that the extract is from the same MUL previously seen.
38786 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
        Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
      return SDValue();
  }

38791 // Check if the Mul source can be safely shrunk.
  ShrinkMode Mode;
  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();
38796 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38797 ArrayRef<SDValue> Ops) {
38798 // Shrink by adding truncate nodes and let DAGCombine fold with the
    // sources.
    EVT InVT = Ops[0].getValueType();
38801 assert(InVT.getScalarType() == MVT::i32 &&
38802 "Unexpected scalar element type");
38803 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
38804 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38805 InVT.getVectorNumElements() / 2);
38806 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38807 InVT.getVectorNumElements());
38808 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
38809 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
38810 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
38812 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
                          { Mul.getOperand(0), Mul.getOperand(1) },
                          PMADDBuilder);
}
38817 // Attempt to turn this pattern into PMADDWD.
38818 // (mul (add (zext (build_vector)), (zext (build_vector))),
38819 // (add (zext (build_vector)), (zext (build_vector)))
38820 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
38821 const SDLoc &DL, EVT VT,
38822 const X86Subtarget &Subtarget) {
38823 if (!Subtarget.hasSSE2())
    return SDValue();

  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
    return SDValue();
38829 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
38830 VT.getVectorNumElements() < 4 ||
      !isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

38834 SDValue N00 = N0.getOperand(0);
38835 SDValue N01 = N0.getOperand(1);
38836 SDValue N10 = N1.getOperand(0);
38837 SDValue N11 = N1.getOperand(1);
38839 // All inputs need to be sign extends.
38840 // TODO: Support ZERO_EXTEND from known positive?
38841 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
38842 N01.getOpcode() != ISD::SIGN_EXTEND ||
38843 N10.getOpcode() != ISD::SIGN_EXTEND ||
      N11.getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();
38847 // Peek through the extends.
38848 N00 = N00.getOperand(0);
38849 N01 = N01.getOperand(0);
38850 N10 = N10.getOperand(0);
38851 N11 = N11.getOperand(0);
38853 // Must be extending from vXi16.
38854 EVT InVT = N00.getValueType();
38855 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
      N10.getValueType() != InVT || N11.getValueType() != InVT)
    return SDValue();
38859 // All inputs should be build_vectors.
38860 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
38861 N01.getOpcode() != ISD::BUILD_VECTOR ||
38862 N10.getOpcode() != ISD::BUILD_VECTOR ||
      N11.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();
  // For each result element, we need the even element of one input multiplied
  // by the even element of the other input, plus the product of the two
  // corresponding odd elements. That is, for each element i we must form:
38871 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
  SDValue In0, In1;
  for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
38874 SDValue N00Elt = N00.getOperand(i);
38875 SDValue N01Elt = N01.getOperand(i);
38876 SDValue N10Elt = N10.getOperand(i);
38877 SDValue N11Elt = N11.getOperand(i);
38878 // TODO: Be more tolerant to undefs.
38879 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38880 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38881 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
38884 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
38885 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
38886 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
38887 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
    if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
      return SDValue();
38890 unsigned IdxN00 = ConstN00Elt->getZExtValue();
38891 unsigned IdxN01 = ConstN01Elt->getZExtValue();
38892 unsigned IdxN10 = ConstN10Elt->getZExtValue();
38893 unsigned IdxN11 = ConstN11Elt->getZExtValue();
38894 // Add is commutative so indices can be reordered.
38895 if (IdxN00 > IdxN10) {
38896 std::swap(IdxN00, IdxN10);
38897 std::swap(IdxN01, IdxN11);
    // N0 indices must be the even elements. N1 indices must be the next odd
    // elements.
38900 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
      return SDValue();
38903 SDValue N00In = N00Elt.getOperand(0);
38904 SDValue N01In = N01Elt.getOperand(0);
38905 SDValue N10In = N10Elt.getOperand(0);
38906 SDValue N11In = N11Elt.getOperand(0);
    // The first time we find an input, capture it.
    if (!In0) {
      In0 = N00In;
      In1 = N01In;
    }
38912 // Mul is commutative so the input vectors can be in any order.
38913 // Canonicalize to make the compares easier.
    if (In0 != N00In)
      std::swap(N00In, N01In);
    if (In0 != N10In)
      std::swap(N10In, N11In);
    if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
      return SDValue();
  }

38922 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38923 ArrayRef<SDValue> Ops) {
38924 // Shrink by adding truncate nodes and let DAGCombine fold with the
    // sources.
    EVT InVT = Ops[0].getValueType();
38927 assert(InVT.getScalarType() == MVT::i16 &&
38928 "Unexpected scalar element type");
38929 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
38930 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38931 InVT.getVectorNumElements() / 2);
38932 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
                          PMADDBuilder);
}
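// Combine ISD::ADD: vector-reduction MAdd/SAD patterns, the two PMADDWD
// matchers above, horizontal adds, vector increments, and ADC/SBB formation.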
38938 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
38939 const X86Subtarget &Subtarget) {
38940 const SDNodeFlags Flags = N->getFlags();
38941 if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
38947 EVT VT = N->getValueType(0);
38948 SDValue Op0 = N->getOperand(0);
38949 SDValue Op1 = N->getOperand(1);
  if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;
  if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;
38956 // Try to synthesize horizontal adds from adds of shuffles.
38957 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
38958 VT == MVT::v8i32) &&
38959 Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
38960 auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38961 ArrayRef<SDValue> Ops) {
38962 return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
                            HADDBuilder);
  }

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;
  return combineAddOrSubToADCOrSBB(N, DAG);
}
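// Try to turn a sub of umax/umin into X86ISD::SUBUS (unsigned saturating
// subtract), e.g.:
//   (sub (umax A, B), B) --> (subus A, B)
//   (sub A, (umin A, B)) --> (subus A, B)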
38974 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
38975 const X86Subtarget &Subtarget) {
38976 SDValue Op0 = N->getOperand(0);
38977 SDValue Op1 = N->getOperand(1);
38978 EVT VT = N->getValueType(0);
38980 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
38981 // is only worth it with SSSE3 (PSHUFB).
38982 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
38983 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
38984 !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
38985 !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
38986 VT == MVT::v16i32 || VT == MVT::v8i64)))
38989 SDValue SubusLHS, SubusRHS;
38990 // Try to find umax(a,b) - b or a - umin(a,b) patterns
38991 // they may be converted to subus(a,b).
38992 // TODO: Need to add IR canonicalization for this code.
  if (Op0.getOpcode() == ISD::UMAX) {
    SubusRHS = Op1;
    SDValue MaxLHS = Op0.getOperand(0);
    SDValue MaxRHS = Op0.getOperand(1);
    if (MaxLHS == Op1)
      SubusLHS = MaxRHS;
    else if (MaxRHS == Op1)
      SubusLHS = MaxLHS;
    else
      return SDValue();
  } else if (Op1.getOpcode() == ISD::UMIN) {
    SubusLHS = Op0;
    SDValue MinLHS = Op1.getOperand(0);
    SDValue MinRHS = Op1.getOperand(1);
    if (MinLHS == Op0)
      SubusRHS = MinRHS;
    else if (MinRHS == Op0)
      SubusRHS = MinLHS;
    else
      return SDValue();
  } else
    return SDValue();
39016 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39017 ArrayRef<SDValue> Ops) {
39018 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
39021 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
39022 // special preprocessing in some cases.
39023 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
39024 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
39025 { SubusLHS, SubusRHS }, SUBUSBuilder);
  // The special preprocessing can only be applied if the value was zero
  // extended from 16 bits, so we require the leading 16 bits to be zero for
  // 32-bit values (or the leading 48 bits for 64-bit values).
  KnownBits Known;
  DAG.computeKnownBits(SubusLHS, Known);
39033 unsigned NumZeros = Known.countMinLeadingZeros();
  if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
    return SDValue();

39037 EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType;
  if (VT == MVT::v8i32 || VT == MVT::v8i64)
    ShrinkedType = MVT::v8i16;
  else
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
  // If SubusLHS is zero extended, truncate SubusRHS to its size:
  // SubusRHS = umin(0xFFF..., SubusRHS).
39046 SDValue SaturationConst =
39047 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
39048 ShrinkedType.getScalarSizeInBits()),
39049 SDLoc(SubusLHS), ExtType);
  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
                             SaturationConst);
39052 SDValue NewSubusLHS =
39053 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
39054 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus =
      SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
                       { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
  // Zero extend the result; it may be used somewhere as 32 bits. If not, the
  // zext and a following trunc will simply be shrunk away.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
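// Combine ISD::SUB: rewrite (sub C, (xor X, K)) as (add (xor X, ~K), C + 1),
// then try horizontal subs, vector decrements, PSUBUS, and ADC/SBB formation.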
39063 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
39064 const X86Subtarget &Subtarget) {
39065 SDValue Op0 = N->getOperand(0);
39066 SDValue Op1 = N->getOperand(1);
39068 // X86 can't encode an immediate LHS of a sub. See if we can push the
39069 // negation into a preceding instruction.
39070 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
39071 // If the RHS of the sub is a XOR with one use and a constant, invert the
39072 // immediate. Then add one to the LHS of the sub so we can turn
39073 // X-Y -> X+~Y+1, saving one register.
39074 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
39075 isa<ConstantSDNode>(Op1.getOperand(1))) {
39076 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
39077 EVT VT = Op0.getValueType();
39078 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
39081 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
39082 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
39086 // Try to synthesize horizontal subs from subs of shuffles.
39087 EVT VT = N->getValueType(0);
39088 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
39089 VT == MVT::v8i32) &&
39090 Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
39091 auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39092 ArrayRef<SDValue> Ops) {
39093 return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
                            HSUBBuilder);
  }

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;
39102 // Try to create PSUBUS if SUB's argument is max/min
  if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
    return V;
  return combineAddOrSubToADCOrSBB(N, DAG);
}
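// Combine the vector extension nodes (VSEXT/VZEXT and the
// *_EXTEND_VECTOR_INREG variants): constant-fold them and merge or bypass
// nested vzext chains.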
39109 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
39110 TargetLowering::DAGCombinerInfo &DCI,
39111 const X86Subtarget &Subtarget) {
39112 if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
39117 MVT VT = N->getSimpleValueType(0);
39118 MVT SVT = VT.getVectorElementType();
39119 unsigned NumElts = VT.getVectorNumElements();
39120 unsigned EltSizeInBits = SVT.getSizeInBits();
39122 SDValue Op = N->getOperand(0);
39123 MVT OpVT = Op.getSimpleValueType();
39124 MVT OpEltVT = OpVT.getVectorElementType();
39125 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
39126 unsigned InputBits = OpEltSizeInBits * NumElts;
39128 // Perform any constant folding.
39129 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
39132 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
39133 APInt Undefs(NumElts, 0);
39134 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
39137 for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
39142 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
39143 : EltBits[i].sextOrTrunc(EltSizeInBits);
39145 return getConstVector(Vals, Undefs, VT, DAG, DL);
39148 // (vzext (bitcast (vzext (x)) -> (vzext x)
39149 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
39150 SDValue V = peekThroughBitcasts(Op);
39151 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
39152 MVT InnerVT = V.getSimpleValueType();
39153 MVT InnerEltVT = InnerVT.getVectorElementType();
39155 // If the element sizes match exactly, we can just do one larger vzext. This
39156 // is always an exact type match as vzext operates on integer types.
39157 if (OpEltVT == InnerEltVT) {
39158 assert(OpVT == InnerVT && "Types must match for vzext!");
39159 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
39162 // The only other way we can combine them is if only a single element of the
39163 // inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

39167 // In this case, the inner vzext is completely dead because we're going to
39168 // only look at bits inside of the low element. Just do the outer vzext on
39169 // a bitcast of the input to the inner.
39170 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
39173 // Check if we can bypass extracting and re-inserting an element of an input
39174 // vector. Essentially:
39175 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
39176 // TODO: Add X86ISD::VSEXT support
39177 if (Opcode == X86ISD::VZEXT &&
39178 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39179 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39180 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
39181 SDValue ExtractedV = V.getOperand(0);
39182 SDValue OrigV = ExtractedV.getOperand(0);
39183 if (isNullConstant(ExtractedV.getOperand(1))) {
39184 MVT OrigVT = OrigV.getSimpleValueType();
39185 // Extract a subvector if necessary...
39186 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
39187 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
39188 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
39189 OrigVT.getVectorNumElements() / Ratio);
39190 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
39191 DAG.getIntPtrConstant(0, DL));
39193 Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}
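// Fold PCMPEQ/PCMPGT with identical operands: X == X is all-ones and X > X
// (signed) is all-zeros.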
39201 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
39202 const X86Subtarget &Subtarget) {
39203 MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) == N->getOperand(1)) {
39207 if (N->getOpcode() == X86ISD::PCMPEQ)
39208 return getOnesVector(VT, DAG, DL);
39209 if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}
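// Combine INSERT_SUBVECTOR: fold inserts into zero vectors, turn an
// insert-of-extract into a shuffle, and merge two half-width subvector loads
// into a single wide load or a SUBV_BROADCAST.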
39216 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
39217 TargetLowering::DAGCombinerInfo &DCI,
39218 const X86Subtarget &Subtarget) {
39219 if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT OpVT = N->getSimpleValueType(0);

  bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;

  SDLoc dl(N);
39227 SDValue Vec = N->getOperand(0);
39228 SDValue SubVec = N->getOperand(1);
39230 unsigned IdxVal = N->getConstantOperandVal(2);
39231 MVT SubVecVT = SubVec.getSimpleValueType();
39233 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
39234 // Inserting zeros into zeros is a nop.
39235 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39236 return getZeroVector(OpVT, Subtarget, DAG, dl);
39238 // If we're inserting into a zero vector and then into a larger zero vector,
39239 // just insert into the larger zero vector directly.
39240 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39241 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
39242 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
39243 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39244 getZeroVector(OpVT, Subtarget, DAG, dl),
39245 SubVec.getOperand(1),
39246 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
    // If we're inserting into a zero vector, our input was extracted from an
    // insert into a zero vector of the same type, and the extraction was at
    // least as large as the original insertion, just insert the original
    // subvector into a zero vector.
39253 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
39254 SubVec.getConstantOperandVal(1) == 0 &&
39255 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
39256 SDValue Ins = SubVec.getOperand(0);
39257 if (Ins.getConstantOperandVal(2) == 0 &&
39258 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
39259 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
39260 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39261 getZeroVector(OpVT, Subtarget, DAG, dl),
39262 Ins.getOperand(1), N->getOperand(2));
39265 // If we're inserting a bitcast into zeros, rewrite the insert and move the
  // bitcast to the other side. This helps with detecting zero-extending
  // patterns during isel.
  // TODO: Is this useful for indices other than 0?
39269 if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
39270 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
39271 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
39272 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
39273 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
39274 DAG.getBitcast(NewVT, Vec),
39275 SubVec.getOperand(0), N->getOperand(2));
39276 return DAG.getBitcast(OpVT, Insert);
  // Stop here if this is an i1 vector.
  if (IsI1Vector)
    return SDValue();
39284 // If this is an insert of an extract, combine to a shuffle. Don't do this
39285 // if the insert or extract can be represented with a subregister operation.
39286 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39287 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
39288 (IdxVal != 0 || !Vec.isUndef())) {
39289 int ExtIdxVal = SubVec.getConstantOperandVal(1);
39290 if (ExtIdxVal != 0) {
39291 int VecNumElts = OpVT.getVectorNumElements();
39292 int SubVecNumElts = SubVecVT.getVectorNumElements();
39293 SmallVector<int, 64> Mask(VecNumElts);
39294 // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
39297 // Now insert the extracted portion.
39298 for (int i = 0; i != SubVecNumElts; ++i)
39299 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
39301 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
39305 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
39307 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39308 // (load16 addr + 16), Elts/2)
39311 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39312 // (load32 addr + 32), Elts/2)
39314 // or a 16-byte or 32-byte broadcast:
39315 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39316 // (load16 addr), Elts/2)
39317 // --> X86SubVBroadcast(load16 addr)
39319 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39320 // (load32 addr), Elts/2)
39321 // --> X86SubVBroadcast(load32 addr)
39322 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
39323 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39324 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
39325 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
39326 if (Idx2 && Idx2->getZExtValue() == 0) {
39327 SDValue SubVec2 = Vec.getOperand(1);
39328 // If needed, look through bitcasts to get to the load.
39329 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
39332 unsigned AS = FirstLd->getAddressSpace();
39333 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
39334 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
39335 OpVT, AS, Alignment, &Fast) && Fast) {
39336 SDValue Ops[] = {SubVec2, SubVec};
39337 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
39342 // If lower/upper loads are the same and the only users of the load, then
39343 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
39344 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
39345 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
39346 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
39347 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
39349 // If this is subv_broadcast insert into both halves, use a larger
39351 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
39352 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
39353 SubVec.getOperand(0));
39355 // If we're inserting all zeros into the upper half, change this to
39356 // an insert into an all zeros vector. We will match this to a move
39357 // with implicit upper bit zeroing during isel.
39358 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39359 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39360 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
39361 Vec.getOperand(2));
39363 // If we are inserting into both halves of the vector, the starting
      // vector should be undef. If it isn't, make it so. Only do this if the
      // early insert has no other uses.
39366 // TODO: Should this be a generic DAG combine?
39367 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
39368 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
39369 SubVec2, Vec.getOperand(2));
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
                           N->getOperand(2));
      }
    }
  }

  return SDValue();
}
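// Combine EXTRACT_SUBVECTOR: extracts of constant vectors fold to constants,
// extracts of build_vectors shrink the build_vector, and a zero-index extract
// of a one-use conversion can use the narrower conversion node instead.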
39380 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
39381 TargetLowering::DAGCombinerInfo &DCI,
39382 const X86Subtarget &Subtarget) {
39383 if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT OpVT = N->getSimpleValueType(0);
39387 SDValue InVec = N->getOperand(0);
39388 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
39390 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
39391 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
39393 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
39394 if (OpVT.getScalarType() == MVT::i1)
39395 return DAG.getConstant(1, SDLoc(N), OpVT);
39396 return getOnesVector(OpVT, DAG, SDLoc(N));
39399 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
39400 return DAG.getBuildVector(
        OpVT, SDLoc(N),
        InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
39404 // If we're extracting the lowest subvector and we're the only user,
39405 // we may be able to perform this with a smaller vector width.
39406 if (IdxVal == 0 && InVec.hasOneUse()) {
39407 unsigned InOpcode = InVec.getOpcode();
39408 if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
39409 // v2f64 CVTDQ2PD(v4i32).
39410 if (InOpcode == ISD::SINT_TO_FP &&
39411 InVec.getOperand(0).getValueType() == MVT::v4i32) {
39412 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
39414 // v2f64 CVTPS2PD(v4f32).
39415 if (InOpcode == ISD::FP_EXTEND &&
39416 InVec.getOperand(0).getValueType() == MVT::v4f32) {
39417 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
39420 if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) &&
39421 OpVT.is128BitVector() &&
39422 InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
39423 unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG
39424 : ISD::SIGN_EXTEND_VECTOR_INREG;
      return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
    }
  }

  return SDValue();
}
39432 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
39433 EVT VT = N->getValueType(0);
39434 SDValue Src = N->getOperand(0);
39436 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
39437 // This occurs frequently in our masked scalar intrinsic code and our
39438 // floating point select lowering with AVX512.
39439 // TODO: SimplifyDemandedBits instead?
39440 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
39441 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
39442 if (C->getAPIntValue().isOneValue())
39443 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
                           Src.getOperand(0));

  return SDValue();
}
39449 // Simplify PMULDQ and PMULUDQ operations.
39450 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
39451 TargetLowering::DAGCombinerInfo &DCI) {
39452 SDValue LHS = N->getOperand(0);
39453 SDValue RHS = N->getOperand(1);
39455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39456 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
39457 !DCI.isBeforeLegalizeOps());
39458 APInt DemandedMask(APInt::getLowBitsSet(64, 32));
  // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
39461 KnownBits LHSKnown;
39462 if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
39463 DCI.CommitTargetLoweringOpt(TLO);
39464 return SDValue(N, 0);
39467 KnownBits RHSKnown;
39468 if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
39469 DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }

  return SDValue();
}
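// Dispatch a node to the target-specific combine implementing its opcode.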
39476 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
39477 DAGCombinerInfo &DCI) const {
39478 SelectionDAG &DAG = DCI.DAG;
39479 switch (N->getOpcode()) {
  default: break;
  case ISD::SCALAR_TO_VECTOR:
39482 return combineScalarToVector(N, DAG);
39483 case ISD::EXTRACT_VECTOR_ELT:
39484 case X86ISD::PEXTRW:
39485 case X86ISD::PEXTRB:
39486 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
39487 case ISD::INSERT_SUBVECTOR:
39488 return combineInsertSubvector(N, DAG, DCI, Subtarget);
39489 case ISD::EXTRACT_SUBVECTOR:
39490 return combineExtractSubvector(N, DAG, DCI, Subtarget);
39493 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
39494 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
39495 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
39496 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
39497 case ISD::SUB: return combineSub(N, DAG, Subtarget);
39498 case X86ISD::SBB: return combineSBB(N, DAG);
39499 case X86ISD::ADC: return combineADC(N, DAG, DCI);
39500 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
39504 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
39505 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
39506 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
39507 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
39508 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
39509 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
39510 case ISD::STORE: return combineStore(N, DAG, Subtarget);
39511 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
39512 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
39513 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
39516 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
39517 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
39518 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
39519 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
39520 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
39527 case X86ISD::BT: return combineBT(N, DAG, DCI);
39528 case ISD::ANY_EXTEND:
39529 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
39530 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
39531 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
39532 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
39533 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
39534 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
39535 case X86ISD::PACKSS:
39536 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
39537 case X86ISD::VSHLI:
39538 case X86ISD::VSRAI:
39539 case X86ISD::VSRLI:
39540 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
39541 case ISD::SIGN_EXTEND_VECTOR_INREG:
39542 case ISD::ZERO_EXTEND_VECTOR_INREG:
39543 case X86ISD::VSEXT:
39544 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
39545 case X86ISD::PINSRB:
39546 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
39547 case X86ISD::SHUFP: // Handle all target specific shuffles
39548 case X86ISD::INSERTPS:
39549 case X86ISD::EXTRQI:
39550 case X86ISD::INSERTQI:
39551 case X86ISD::PALIGNR:
39552 case X86ISD::VSHLDQ:
39553 case X86ISD::VSRLDQ:
39554 case X86ISD::BLENDI:
39555 case X86ISD::UNPCKH:
39556 case X86ISD::UNPCKL:
39557 case X86ISD::MOVHLPS:
39558 case X86ISD::MOVLHPS:
39559 case X86ISD::PSHUFB:
39560 case X86ISD::PSHUFD:
39561 case X86ISD::PSHUFHW:
39562 case X86ISD::PSHUFLW:
39563 case X86ISD::MOVSHDUP:
39564 case X86ISD::MOVSLDUP:
39565 case X86ISD::MOVDDUP:
39566 case X86ISD::MOVSS:
39567 case X86ISD::MOVSD:
39568 case X86ISD::VBROADCAST:
39569 case X86ISD::VPPERM:
39570 case X86ISD::VPERMI:
39571 case X86ISD::VPERMV:
39572 case X86ISD::VPERMV3:
39573 case X86ISD::VPERMIL2:
39574 case X86ISD::VPERMILPI:
39575 case X86ISD::VPERMILPV:
39576 case X86ISD::VPERM2X128:
39577 case X86ISD::SHUF128:
39578 case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
39580 case X86ISD::FMADD_RND:
39581 case X86ISD::FMSUB:
39582 case X86ISD::FMSUB_RND:
39583 case X86ISD::FNMADD:
39584 case X86ISD::FNMADD_RND:
39585 case X86ISD::FNMSUB:
39586 case X86ISD::FNMSUB_RND:
39587 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
39588 case X86ISD::FMADDSUB_RND:
39589 case X86ISD::FMSUBADD_RND:
39590 case X86ISD::FMADDSUB:
39591 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
39592 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
39593 case X86ISD::MGATHER:
39594 case X86ISD::MSCATTER:
  case ISD::MGATHER:
  case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
39597 case X86ISD::PCMPEQ:
39598 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
39599 case X86ISD::PMULDQ:
  case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
  }

  return SDValue();
}
39606 /// Return true if the target has native support for the specified value type
39607 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
39608 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
39609 /// some i16 instructions are slow.
39610 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
39614 // There are no vXi8 shifts.
  if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
    return false;

  if (VT != MVT::i16)
    return true;
39625 case ISD::SIGN_EXTEND:
39626 case ISD::ZERO_EXTEND:
39627 case ISD::ANY_EXTEND:
39640 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
39641 SDValue Value, SDValue Addr,
39642 SelectionDAG &DAG) const {
39643 const Module *M = DAG.getMachineFunction().getMMI().getModule();
39644 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
39645 if (IsCFProtectionSupported) {
39646 // In case control-flow branch protection is enabled, we need to add
39647 // notrack prefix to the indirect branch.
39648 // In order to do that we create NT_BRIND SDNode.
39649 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
39650 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
39653 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
/// This method queries the target whether it is beneficial for the DAG
/// combiner to promote the specified node. If true, it should return the
/// desired promotion type by reference.
39659 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
39660 EVT VT = Op.getValueType();
39661 if (VT != MVT::i16)
39664 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
    if (!Op.hasOneUse())
      return false;
    SDNode *User = *Op->use_begin();
    if (!ISD::isNormalStore(User))
      return false;
    auto *Ld = cast<LoadSDNode>(Load);
39671 auto *St = cast<StoreSDNode>(User);
39672 return Ld->getBasePtr() == St->getBasePtr();
39675 bool Commute = false;
39676 switch (Op.getOpcode()) {
39677 default: return false;
39678 case ISD::SIGN_EXTEND:
39679 case ISD::ZERO_EXTEND:
39680 case ISD::ANY_EXTEND:
39684 SDValue N0 = Op.getOperand(0);
39685 // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
      return false;
39698 SDValue N0 = Op.getOperand(0);
39699 SDValue N1 = Op.getOperand(1);
39700 // Avoid disabling potential load folding opportunities.
39701 if (MayFoldLoad(N1) &&
39702 (!Commute || !isa<ConstantSDNode>(N0) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
      return false;
39705 if (MayFoldLoad(N0) &&
39706 ((Commute && !isa<ConstantSDNode>(N1)) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
      return false;
39716 bool X86TargetLowering::
39717 isDesirableToCombineBuildVectorToShuffleTruncate(
39718 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
39720 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
39721 "Element count mismatch");
  assert(Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
39724 "Shuffle Mask expected to be legal");
39726 // For 32-bit elements VPERMD is better than shuffle+truncate.
  // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
  if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
    return false;

  if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
    return false;

  return true;
}
39737 //===----------------------------------------------------------------------===//
39738 // X86 Inline Assembly Support
39739 //===----------------------------------------------------------------------===//
39741 // Helper to match a string separated by whitespace.
39742 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
39743 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
39745 for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

39749 S = S.substr(Piece.size());
39750 StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}
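// Check whether the split inline-asm clobber list names the condition-code
// and FP-status registers ("~{cc}"/"~{flags}" and "~{fpsr}", plus
// "~{dirflag}" when four pieces are present).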
39760 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
39762 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
39763 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
39764 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
39765 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
39776 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
39777 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
39779 const std::string &AsmStr = IA->getAsmString();
39781 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;
39785 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
39786 SmallVector<StringRef, 4> AsmPieces;
39787 SplitString(AsmStr, AsmPieces, ";\n");
39789 switch (AsmPieces.size()) {
  default: return false;
  case 1:
39792 // FIXME: this should verify that we are targeting a 486 or better. If not,
39793 // we will turn this bswap into something that will be lowered to logical
39794 // ops instead of emitting the bswap asm. For now, we don't support 486 or
39795 // lower so don't worry about this.
39797 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
39798 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
39799 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
39800 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
39801 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
39802 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
39803 // No need to check constraints, nothing other than the equivalent of
39804 // "=r,0" would be valid here.
39805 return IntrinsicLowering::LowerToByteSwap(CI);
39808 // rorw $$8, ${0:w} --> llvm.bswap.i16
39809 if (CI->getType()->isIntegerTy(16) &&
39810 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
39811 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
39812 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
39815 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
39816 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
39817 if (clobbersFlagRegisters(AsmPieces))
39818 return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
39823 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
39824 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
39825 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
39826 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
39829 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
39830 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
39831 if (clobbersFlagRegisters(AsmPieces))
39832 return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
39836 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
39837 if (Constraints.size() >= 2 &&
39838 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
39839 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
39840 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
39841 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
39842 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
39843 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
39844 return IntrinsicLowering::LowerToByteSwap(CI);
39852 /// Given a constraint letter, return the type of constraint for this target.
39853 X86TargetLowering::ConstraintType
39854 X86TargetLowering::getConstraintType(StringRef Constraint) const {
39855 if (Constraint.size() == 1) {
39856 switch (Constraint[0]) {
39868 case 'k': // AVX512 masking registers.
39869 return C_RegisterClass;
39893 else if (Constraint.size() == 2) {
39894 switch (Constraint[0]) {
39898 switch (Constraint[1]) {
39909 return C_RegisterClass;
39913 return TargetLowering::getConstraintType(Constraint);
39916 /// Examine constraint type and operand type and determine a weight value.
39917 /// This object must already have been set up with the operand type
39918 /// and the current alternative constraint selected.
39919 TargetLowering::ConstraintWeight
39920 X86TargetLowering::getSingleConstraintMatchWeight(
39921 AsmOperandInfo &info, const char *constraint) const {
39922 ConstraintWeight weight = CW_Invalid;
39923 Value *CallOperandVal = info.CallOperandVal;
39924 // If we don't have a value, we can't do a match,
39925 // but allow it at the lowest weight.
39926 if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
39929 // Look at the constraint type.
39930 switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
39944 if (CallOperandVal->getType()->isIntegerTy())
39945 weight = CW_SpecificReg;
39950 if (type->isFloatingPointTy())
39951 weight = CW_SpecificReg;
39954 if (type->isX86_MMXTy() && Subtarget.hasMMX())
39955 weight = CW_SpecificReg;
39958 unsigned Size = StringRef(constraint).size();
    // Pick 'i' as the next char, as 'Yi' and 'Y' are synonymous when
    // matching 'Y'.
39960 char NextChar = Size == 2 ? constraint[1] : 'i';
39963 switch (NextChar) {
39969 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
39970 return CW_SpecificReg;
39972 // Conditional OpMask regs (AVX512)
39974 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
39975 return CW_Register;
39979 if (type->isX86_MMXTy() && Subtarget.hasMMX())
39982 // Any SSE reg when ISA >= SSE2, same as 'Y'
39986 if (!Subtarget.hasSSE2())
39990 // Fall through (handle "Y" constraint).
39994 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
39995 weight = CW_Register;
39998 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
39999 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
40000 weight = CW_Register;
40003 // Enable conditional vector operations using %k<#> registers.
40004 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
40005 weight = CW_Register;
40008 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
40009 if (C->getZExtValue() <= 31)
40010 weight = CW_Constant;
40014 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40015 if (C->getZExtValue() <= 63)
40016 weight = CW_Constant;
40020 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40021 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
40022 weight = CW_Constant;
40026 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40027 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
40028 weight = CW_Constant;
40032 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40033 if (C->getZExtValue() <= 3)
40034 weight = CW_Constant;
40038 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40039 if (C->getZExtValue() <= 0xff)
40040 weight = CW_Constant;
40045 if (isa<ConstantFP>(CallOperandVal)) {
40046 weight = CW_Constant;
40050 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40051 if ((C->getSExtValue() >= -0x80000000LL) &&
40052 (C->getSExtValue() <= 0x7fffffffLL))
40053 weight = CW_Constant;
40057 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40058 if (C->getZExtValue() <= 0xffffffff)
40059 weight = CW_Constant;
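// For example (illustrative): when weighing the alternatives of a constraint
// like "ri", an i32 ConstantInt of 7 matched against 'I' (0..31) reports
// CW_Constant, while a non-constant operand leaves 'I' at CW_Invalid,
// steering the generic code toward the register alternative instead.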
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
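// E.g. (illustrative): asm("..." : "=X"(result)) with a float result is
// rewritten to the "Y" constraint when SSE2 is available and to "x" with
// only SSE1; integer operands fall through to the generic handling, which
// picks "r".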
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
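// For example, with the 'N' constraint (an unsigned 8-bit immediate, as used
// by in/out port operands), a constant 0x80 is folded into Ops as a target
// constant, while 0x100 adds nothing and the operand is rejected by the
// generic layer.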
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
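// Note (illustrative): hasSuperClassEq() also accepts subclasses of the
// named class, so e.g. both GR32 and a smaller class such as GR32_ABCD
// should count as general purpose here.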
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q': // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r': // GENERAL_REGS
    case 'l': // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R': // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f': // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y': // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'i':
    case 't':
    case '2':
      return getRegForInlineAsmConstraint(TRI, "Y", VT);
    case 'm':
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'z':
    case '0':
      if (!Subtarget.hasSSE1()) break;
      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) .. st(7) onto the FP stack registers FP0 .. FP7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Make sure it isn't a register that requires 64-bit mode.
  if (!Subtarget.is64Bit() &&
      (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
      TRI->getEncodingValue(Res.first) >= 8) {
    // Register requires REX prefix, but we're in 32-bit mode.
    Res.first = 0;
    Res.second = nullptr;
    return Res;
  }

  // Make sure it isn't a register that requires AVX512.
  if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
      TRI->getEncodingValue(Res.first) & 0x10) {
    // Register requires EVEX prefix.
    Res.first = 0;
    Res.second = nullptr;
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res; // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64-bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      bool is64Bit = Subtarget.is64Bit();
      const TargetRegisterClass *RC =
          Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
        : &X86::GR64RegClass;
      if (RC->contains(DestReg))
        Res = std::make_pair(DestReg, RC);
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
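// Worked example (illustrative): for the constraint "{ax}" with VT ==
// MVT::i32, the generic matcher returns AX in a 16-bit class; the isGRClass
// fix-up above then resizes it to (EAX, GR32), which is what the comment
// about "{ax},i32" -> {eax} refers to.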
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
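// E.g., "inst (reg1)" has AM.Scale == 0 and costs 0, "inst (reg1, reg2, 4)"
// has AM.Scale == 4 and costs 1, and an addressing mode that is not legal at
// all costs -1.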
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
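// E.g. (illustrative): in a function marked minsize, a scalar 'sdiv i32 %x, 7'
// keeps the small idiv instruction, while a 'sdiv <4 x i32>' is still
// expanded to the multiply-based sequence, since scalarizing four idivs would
// cost more size than the vector expansion.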
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
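// In practice (per the FIXME above) this split-CSR path is exercised by the
// cxx_fast_tls calling convention: callee-saved GR64 registers are preserved
// through virtual-register copies in the entry/exit blocks rather than stack
// spills, keeping the hot path of the TLS accessor cheap.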
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // If the function specifically requests stack probes, emit them.
  if (MF.getFunction().hasFnAttribute("probe-stack"))
    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

  // Generally, if we aren't on Windows, the platform ABI does not include
  // support for stack probes, so don't emit them.
  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
    return "";

  // We need a stack probe to conform to the Windows ABI. Choose the right
  // symbol.
  if (Subtarget.is64Bit())
    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
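// For example (hypothetical attribute value): a function carrying
//   "probe-stack"="my_stack_probe"
// gets its probes emitted through the my_stack_probe symbol, while a plain
// function on an x86-64 MSVC target uses __chkstk and one on a 32-bit MinGW
// target uses _alloca.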